Merge tag 'sysctl-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof...
[platform/kernel/linux-starfive.git] / kernel / sched / fair.c
index 8e029a6..c36aa54 100644 (file)
@@ -2976,7 +2976,7 @@ static void task_numa_work(struct callback_head *work)
        }
 
        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+       if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
                return;
 
        /*
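The cmpxchg() -> try_cmpxchg() conversion above works because try_cmpxchg() returns a boolean and writes the current value back into its second argument on failure, so the explicit "!= migrate" comparison is no longer needed. A minimal userspace sketch of the same pattern, using C11 atomics as a stand-in for the kernel primitives (names and the calling context are illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long next_scan_slot;

static bool claim_scan_slot(unsigned long expected, unsigned long next_scan)
{
        /* Before: cmpxchg(&slot, expected, next_scan) != expected  -> bail out
         * After:  !try_cmpxchg(&slot, &expected, next_scan)        -> bail out
         * atomic_compare_exchange_strong() has the try_cmpxchg() semantics:
         * true on success, 'expected' updated in place on failure. */
        return atomic_compare_exchange_strong(&next_scan_slot, &expected, next_scan);
}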
@@ -4292,14 +4292,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+                                            unsigned long uclamp_min,
+                                            unsigned long uclamp_max)
 {
-       return clamp(task_util_est(p),
-                    uclamp_eff_value(p, UCLAMP_MIN),
-                    uclamp_eff_value(p, UCLAMP_MAX));
+       return clamp(task_util_est(p), uclamp_min, uclamp_max);
 }
 #else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+                                            unsigned long uclamp_min,
+                                            unsigned long uclamp_max)
 {
        return task_util_est(p);
 }
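With the new signature, callers look up the effective clamp values once and pass them down, rather than having uclamp_task_util() call uclamp_eff_value() itself. Condensed from the find_energy_efficient_cpu() hunk later in this patch (not a complete function):

        unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
        unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;

        /* Bail out early if the clamped utilization estimate is zero. */
        if (!uclamp_task_util(p, p_util_min, p_util_max))
                goto unlock;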
@@ -4438,10 +4440,139 @@ done:
        trace_sched_util_est_se_tp(&p->se);
 }
 
-static inline int task_fits_capacity(struct task_struct *p,
-                                    unsigned long capacity)
+static inline int util_fits_cpu(unsigned long util,
+                               unsigned long uclamp_min,
+                               unsigned long uclamp_max,
+                               int cpu)
 {
-       return fits_capacity(uclamp_task_util(p), capacity);
+       unsigned long capacity_orig, capacity_orig_thermal;
+       unsigned long capacity = capacity_of(cpu);
+       bool fits, uclamp_max_fits;
+
+       /*
+        * Check if the real util fits without any uclamp boost/cap applied.
+        */
+       fits = fits_capacity(util, capacity);
+
+       if (!uclamp_is_used())
+               return fits;
+
+       /*
+        * We must use capacity_orig_of() for comparing against uclamp_min and
+        * uclamp_max. We only care about capacity pressure (by using
+        * capacity_of()) for comparing against the real util.
+        *
+        * If a task is boosted to 1024 for example, we don't want a tiny
+        * pressure to skew the check of whether it fits a CPU or not.
+        *
+        * Similarly, if a task is capped to capacity_orig_of(little_cpu), it
+        * should fit a little cpu even if there's some pressure.
+        *
+        * The only exception is thermal pressure, since it has a direct
+        * impact on the available OPPs of the system.
+        *
+        * We honour it for uclamp_min only, as a drop in performance level
+        * could result in not getting the requested minimum performance level.
+        *
+        * For uclamp_max, we can tolerate a drop in performance level since
+        * the goal is to cap the task. So it's okay if it gets less.
+        *
+        * In case of capacity inversion, we should always honour the inverted
+        * capacity for both uclamp_min and uclamp_max.
+        */
+       capacity_orig = cpu_in_capacity_inversion(cpu);
+       if (capacity_orig) {
+               capacity_orig_thermal = capacity_orig;
+       } else {
+               capacity_orig = capacity_orig_of(cpu);
+               capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+       }
+
+       /*
+        * We want to force a task to fit a cpu as implied by uclamp_max.
+        * But we do have some corner cases to cater for:
+        *
+        *
+        *                                 C=z
+        *   |                             ___
+        *   |                  C=y       |   |
+        *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
+        *   |      C=x        |   |      |   |
+        *   |      ___        |   |      |   |
+        *   |     |   |       |   |      |   |    (util somewhere in this region)
+        *   |     |   |       |   |      |   |
+        *   |     |   |       |   |      |   |
+        *   +----------------------------------------
+        *         cpu0        cpu1       cpu2
+        *
+        *   In the above example, if a task is capped to a specific performance
+        *   point, y, then when:
+        *
+        *   * util = 80% of x, it does not fit on cpu0 and should migrate
+        *     to cpu1.
+        *   * util = 80% of y, it is forced to fit on cpu1 to honour the
+        *     uclamp_max request.
+        *
+        *   which is what we're enforcing here. A task always fits if
+        *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+        *   the normal upmigration rules still apply.
+        *
+        *   The only exception is when we are at max capacity; then we need
+        *   to be careful not to block the overutilized state. This is
+        *   because:
+        *
+        *     1. There's no concept of capping at max_capacity! We can't go
+        *        beyond this performance level anyway.
+        *     2. The system is saturated when we're operating near max
+        *        capacity, so it doesn't make sense to block overutilized.
+        */
+       uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+       uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+       fits = fits || uclamp_max_fits;
+
+       /*
+        *
+        *                                 C=z
+        *   |                             ___       (region a, capped, util >= uclamp_max)
+        *   |                  C=y       |   |
+        *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+        *   |      C=x        |   |      |   |
+        *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
+        *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+        *   |     |   |       |   |      |   |
+        *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
+        *   +----------------------------------------
+        *         cpu0        cpu1       cpu2
+        *
+        * a) If util > uclamp_max, then we're capped and we don't care about
+        *    the actual fitness value here. We only care whether uclamp_max
+        *    fits the capacity without taking margin/pressure into account.
+        *    See the comment above.
+        *
+        * b) If uclamp_min <= util <= uclamp_max, then the normal
+        *    fits_capacity() rules apply, except that we must also ensure we
+        *    remain within uclamp_max; see the comment above.
+        *
+        * c) If util < uclamp_min, then we are boosted. Same as (b), but we
+        *    also need to ensure the boosted value fits the CPU without
+        *    taking margin/pressure into account.
+        *
+        * Cases (a) and (b) are handled in the 'fits' variable already. We
+        * just need to add an extra check for case (c), after ensuring we
+        * handle the case uclamp_min > uclamp_max.
+        */
+       uclamp_min = min(uclamp_min, uclamp_max);
+       if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+               fits = fits && (uclamp_min <= capacity_orig_thermal);
+
+       return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+       unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+       unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+       unsigned long util = task_util_est(p);
+       return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
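As a worked illustration of the rules described in the comments above, here is a standalone, simplified model of util_fits_cpu(). Capacity inversion and the uclamp_is_used() short-circuit are omitted, the thermal-pressure-adjusted capacity is passed in directly, and SCHED_CAPACITY_SCALE is assumed to be 1024; this is a sketch, not the kernel implementation:

#include <stdbool.h>

#define SCALE 1024UL

/* Kernel's fits_capacity(): fit with roughly 20% headroom. */
static bool fits_capacity(unsigned long util, unsigned long cap)
{
        return util * 1280 < cap * 1024;
}

static bool util_fits_cpu_model(unsigned long util,
                                unsigned long uclamp_min,
                                unsigned long uclamp_max,
                                unsigned long cap_orig,         /* capacity_orig_of(cpu) */
                                unsigned long cap_orig_thermal, /* minus thermal pressure */
                                unsigned long cap)              /* capacity_of(cpu) */
{
        /* Real util vs pressured capacity, with the usual margin. */
        bool fits = fits_capacity(util, cap);
        bool uclamp_max_fits;

        /* uclamp_max can force a fit, except at the top capacity level
         * where the overutilized condition must not be hidden. */
        uclamp_max_fits = (cap_orig == SCALE) && (uclamp_max == SCALE);
        uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= cap_orig);
        fits = fits || uclamp_max_fits;

        /* A boost (util < uclamp_min) must fit the original capacity,
         * thermal pressure included, margin excluded. */
        if (uclamp_min > uclamp_max)
                uclamp_min = uclamp_max;
        if (util < uclamp_min && cap_orig != SCALE)
                fits = fits && (uclamp_min <= cap_orig_thermal);

        return fits;
}

For example, a task with util = 300, uclamp_min = 0 and uclamp_max = 400 on a little CPU (cap_orig = 512) whose capacity is pressured down to 350: the plain margin check fails (300 * 1280 >= 350 * 1024), but uclamp_max <= 512 still reports a fit, matching the rule that a task always fits when uclamp_max <= capacity_orig.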
@@ -4454,7 +4585,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
                return;
        }
 
-       if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+       if (task_fits_cpu(p, cpu_of(rq))) {
                rq->misfit_task_load = 0;
                return;
        }
@@ -5874,7 +6005,10 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static inline bool cpu_overutilized(int cpu)
 {
-       return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+       unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+       unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+       return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
 }
 
 static inline void update_overutilized_status(struct rq *rq)
@@ -6666,21 +6800,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
-       unsigned long task_util, best_cap = 0;
+       unsigned long task_util, util_min, util_max, best_cap = 0;
        int cpu, best_cpu = -1;
        struct cpumask *cpus;
 
        cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-       task_util = uclamp_task_util(p);
+       task_util = task_util_est(p);
+       util_min = uclamp_eff_value(p, UCLAMP_MIN);
+       util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
        for_each_cpu_wrap(cpu, cpus, target) {
                unsigned long cpu_cap = capacity_of(cpu);
 
                if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
                        continue;
-               if (fits_capacity(task_util, cpu_cap))
+               if (util_fits_cpu(task_util, util_min, util_max, cpu))
                        return cpu;
 
                if (cpu_cap > best_cap) {
@@ -6692,10 +6828,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
        return best_cpu;
 }
 
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+                                unsigned long util_min,
+                                unsigned long util_max,
+                                int cpu)
 {
        if (sched_asym_cpucap_active())
-               return fits_capacity(task_util, capacity_of(cpu));
+               return util_fits_cpu(util, util_min, util_max, cpu);
 
        return true;
 }
@@ -6707,7 +6846,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
        bool has_idle_core = false;
        struct sched_domain *sd;
-       unsigned long task_util;
+       unsigned long task_util, util_min, util_max;
        int i, recent_used_cpu;
 
        /*
@@ -6716,7 +6855,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         */
        if (sched_asym_cpucap_active()) {
                sync_entity_load_avg(&p->se);
-               task_util = uclamp_task_util(p);
+               task_util = task_util_est(p);
+               util_min = uclamp_eff_value(p, UCLAMP_MIN);
+               util_max = uclamp_eff_value(p, UCLAMP_MAX);
        }
 
        /*
@@ -6725,7 +6866,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        lockdep_assert_irqs_disabled();
 
        if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
-           asym_fits_capacity(task_util, target))
+           asym_fits_cpu(task_util, util_min, util_max, target))
                return target;
 
        /*
@@ -6733,7 +6874,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         */
        if (prev != target && cpus_share_cache(prev, target) &&
            (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-           asym_fits_capacity(task_util, prev))
+           asym_fits_cpu(task_util, util_min, util_max, prev))
                return prev;
 
        /*
@@ -6748,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            in_task() &&
            prev == smp_processor_id() &&
            this_rq()->nr_running <= 1 &&
-           asym_fits_capacity(task_util, prev)) {
+           asym_fits_cpu(task_util, util_min, util_max, prev)) {
                return prev;
        }
 
@@ -6760,7 +6901,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            cpus_share_cache(recent_used_cpu, target) &&
            (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
-           asym_fits_capacity(task_util, recent_used_cpu)) {
+           asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
                return recent_used_cpu;
        }
 
@@ -7056,6 +7197,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+       unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+       unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
        struct root_domain *rd = this_rq()->rd;
        int cpu, best_energy_cpu, target = -1;
        struct sched_domain *sd;
@@ -7080,7 +7223,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        target = prev_cpu;
 
        sync_entity_load_avg(&p->se);
-       if (!task_util_est(p))
+       if (!uclamp_task_util(p, p_util_min, p_util_max))
                goto unlock;
 
        eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7088,7 +7231,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        for (; pd; pd = pd->next) {
                unsigned long cpu_cap, cpu_thermal_cap, util;
                unsigned long cur_delta, max_spare_cap = 0;
-               bool compute_prev_delta = false;
+               unsigned long rq_util_min, rq_util_max;
+               unsigned long util_min, util_max;
+               unsigned long prev_spare_cap = 0;
                int max_spare_cap_cpu = -1;
                unsigned long base_energy;
 
@@ -7124,26 +7269,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                         * much capacity we can get out of the CPU; this is
                         * aligned with sched_cpu_util().
                         */
-                       util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
-                       if (!fits_capacity(util, cpu_cap))
+                       if (uclamp_is_used()) {
+                               if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+                                       util_min = p_util_min;
+                                       util_max = p_util_max;
+                               } else {
+                                       /*
+                                        * Open code uclamp_rq_util_with() except for
+                                        * the clamp() part, i.e. apply max aggregation
+                                        * only. The util_fits_cpu() logic requires
+                                        * operating on the non-clamped util but must
+                                        * use the max-aggregated uclamp_{min, max}.
+                                        */
+                                       rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+                                       rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+                                       util_min = max(rq_util_min, p_util_min);
+                                       util_max = max(rq_util_max, p_util_max);
+                               }
+                       }
+                       if (!util_fits_cpu(util, util_min, util_max, cpu))
                                continue;
 
                        lsub_positive(&cpu_cap, util);
 
                        if (cpu == prev_cpu) {
                                /* Always use prev_cpu as a candidate. */
-                               compute_prev_delta = true;
+                               prev_spare_cap = cpu_cap;
                        } else if (cpu_cap > max_spare_cap) {
                                /*
                                 * Find the CPU with the maximum spare capacity
-                                * in the performance domain.
+                                * among the remaining CPUs in the performance
+                                * domain.
                                 */
                                max_spare_cap = cpu_cap;
                                max_spare_cap_cpu = cpu;
                        }
                }
 
-               if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+               if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
                        continue;
 
                eenv_pd_busy_time(&eenv, cpus, p);
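The open-coded aggregation in the hunk above keeps the rq-wide and per-task clamps separate from the utilization itself. A condensed sketch of just that step (the helper name is hypothetical, for illustration only):

/*
 * Max-aggregate the rq-wide and per-task clamps without touching util.
 * uclamp_rq_util_with() would additionally do clamp(util, min, max),
 * which is exactly the step util_fits_cpu() needs to see un-applied.
 */
static void aggregate_clamps(unsigned long rq_min, unsigned long rq_max,
                             unsigned long p_min, unsigned long p_max,
                             unsigned long *util_min, unsigned long *util_max)
{
        *util_min = rq_min > p_min ? rq_min : p_min;
        *util_max = rq_max > p_max ? rq_max : p_max;
}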
@@ -7151,7 +7315,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
                /* Evaluate the energy impact of using prev_cpu. */
-               if (compute_prev_delta) {
+               if (prev_spare_cap > 0) {
                        prev_delta = compute_energy(&eenv, pd, cpus, p,
                                                    prev_cpu);
                        /* CPU utilization has changed */
@@ -7162,7 +7326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                }
 
                /* Evaluate the energy impact of using max_spare_cap_cpu. */
-               if (max_spare_cap_cpu >= 0) {
+               if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
                        cur_delta = compute_energy(&eenv, pd, cpus, p,
                                                   max_spare_cap_cpu);
                        /* CPU utilization has changed */
@@ -8288,7 +8452,7 @@ static int detach_tasks(struct lb_env *env)
 
                case migrate_misfit:
                        /* This is not a misfit task */
-                       if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+                       if (task_fits_cpu(p, env->src_cpu))
                                goto next;
 
                        env->imbalance = 0;
@@ -8677,16 +8841,73 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+       unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
        unsigned long capacity = scale_rt_capacity(cpu);
        struct sched_group *sdg = sd->groups;
+       struct rq *rq = cpu_rq(cpu);
 
-       cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+       rq->cpu_capacity_orig = capacity_orig;
 
        if (!capacity)
                capacity = 1;
 
-       cpu_rq(cpu)->cpu_capacity = capacity;
-       trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+       rq->cpu_capacity = capacity;
+
+       /*
+        * Detect if the performance domain is in capacity inversion state.
+        *
+        * Capacity inversion happens when another perf domain with equal or
+        * lower capacity_orig_of() ends up having higher capacity than this
+        * domain after subtracting thermal pressure.
+        *
+        * We only take thermal pressure into account in this detection, as
+        * it's the only metric that actually results in a *real* reduction of
+        * capacity due to performance points (OPPs) being dropped/becoming
+        * unreachable because of thermal throttling.
+        *
+        * We assume:
+        *   * That all cpus in a perf domain have the same capacity_orig
+        *     (same uArch).
+        *   * Thermal pressure will impact all cpus in this perf domain
+        *     equally.
+        */
+       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+               unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+
+               rq->cpu_capacity_inverted = 0;
+
+               for (; pd; pd = pd->next) {
+                       struct cpumask *pd_span = perf_domain_span(pd);
+                       unsigned long pd_cap_orig, pd_cap;
+
+                       cpu = cpumask_any(pd_span);
+                       pd_cap_orig = arch_scale_cpu_capacity(cpu);
+
+                       if (capacity_orig < pd_cap_orig)
+                               continue;
+
+                       /*
+                        * Handle the case where multiple perf domains have
+                        * the same capacity_orig but one of them is under
+                        * higher thermal pressure. We record it as capacity
+                        * inversion.
+                        */
+                       if (capacity_orig == pd_cap_orig) {
+                               pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+
+                               if (pd_cap > inv_cap) {
+                                       rq->cpu_capacity_inverted = inv_cap;
+                                       break;
+                               }
+                       } else if (pd_cap_orig > inv_cap) {
+                               rq->cpu_capacity_inverted = inv_cap;
+                               break;
+                       }
+               }
+       }
+
+       trace_sched_cpu_capacity_tp(rq);
 
        sdg->sgc->capacity = capacity;
        sdg->sgc->min_capacity = capacity;
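To make the inversion detection above concrete, here is a standalone model of the walk over performance domains, with an array standing in for the rd->pd list; the numbers in the closing note are illustrative only:

struct pd_cap {
        unsigned long cap_orig; /* arch_scale_cpu_capacity() of the domain */
        unsigned long thermal;  /* thermal_load_avg() of a CPU in it */
};

static unsigned long detect_inversion(unsigned long cap_orig,
                                      unsigned long thermal,
                                      const struct pd_cap *pds, int nr)
{
        /* This CPU's capacity after subtracting its thermal pressure. */
        unsigned long inv_cap = cap_orig - thermal;
        int i;

        for (i = 0; i < nr; i++) {
                unsigned long pd_orig = pds[i].cap_orig;

                if (cap_orig < pd_orig)
                        continue;

                if (cap_orig == pd_orig) {
                        /* Same uArch: inverted only if the peer keeps more
                         * capacity after its own thermal pressure. */
                        if (pd_orig - pds[i].thermal > inv_cap)
                                return inv_cap;
                } else if (pd_orig > inv_cap) {
                        /* A nominally smaller domain now offers more capacity. */
                        return inv_cap;
                }
        }

        return 0;
}

For instance, a big CPU with cap_orig = 1024 and a thermal_load_avg of 600 keeps inv_cap = 424; a little domain with cap_orig = 512 (> 424) then puts the big CPU in capacity inversion and 424 is recorded.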
@@ -9293,6 +9514,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 
        memset(sgs, 0, sizeof(*sgs));
 
+       /* Assume that the task can't fit on any CPU of the group */
+       if (sd->flags & SD_ASYM_CPUCAPACITY)
+               sgs->group_misfit_task_load = 1;
+
        for_each_cpu(i, sched_group_span(group)) {
                struct rq *rq = cpu_rq(i);
                unsigned int local;
@@ -9312,12 +9537,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
                if (!nr_running && idle_cpu_without(i, p))
                        sgs->idle_cpus++;
 
-       }
+               /* Check if task fits in the CPU */
+               if (sd->flags & SD_ASYM_CPUCAPACITY &&
+                   sgs->group_misfit_task_load &&
+                   task_fits_cpu(p, i))
+                       sgs->group_misfit_task_load = 0;
 
-       /* Check if task fits in the group */
-       if (sd->flags & SD_ASYM_CPUCAPACITY &&
-           !task_fits_capacity(p, group->sgc->max_capacity)) {
-               sgs->group_misfit_task_load = 1;
        }
 
        sgs->group_capacity = group->sgc->capacity;
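The wakeup-stats change above inverts the default: under SD_ASYM_CPUCAPACITY the group is assumed misfit up front, and the flag is cleared as soon as one CPU in the group fits the task, instead of testing the task only against the group's max_capacity. A condensed sketch of that control flow (the array and helper name are hypothetical):

static unsigned long group_misfit_task_load(bool asym, const bool *fits, int nr_cpus)
{
        unsigned long misfit = asym ? 1 : 0; /* assume no CPU fits */
        int i;

        for (i = 0; i < nr_cpus; i++) {
                /* 'fits[i]' stands in for task_fits_cpu(p, i). */
                if (asym && misfit && fits[i])
                        misfit = 0; /* one fitting CPU clears it */
        }

        return misfit;
}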