patch-5.15.79-rt54.patch

[platform/kernel/linux-rpi.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index f6a05d9..0e13c85 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3422,7 +3422,6 @@ void set_task_rq_fair(struct sched_entity *se,
         se->avg.last_update_time = n_last_update_time;
  }
  
-
  /*
   * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
   * propagate its contribution. The key to this propagation is the invariant
@@ -3490,7 +3489,6 @@ void set_task_rq_fair(struct sched_entity *se,
   * XXX: only do this for the part of runnable > running ?
   *
   */
-
  static inline void
  update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
  {
@@ -3722,7 +3720,19 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  
                 r = removed_util;
                 sub_positive(&sa->util_avg, r);
-               sa->util_sum = sa->util_avg * divider;
+               sub_positive(&sa->util_sum, r * divider);
+               /*
+                * Because of rounding, se->util_sum might end up being +1 more than
+                * cfs->util_sum. Although this is not a problem by itself, detaching
+                * a lot of tasks with the rounding problem between 2 updates of
+                * util_avg (~1ms) can make cfs->util_sum become null whereas
+                * cfs->util_avg is not.
+                * Check that util_sum is still above its lower bound for the new
+                * util_avg. Given that period_contrib might have moved since the last
+                * sync, we are only sure that util_sum must be above or equal to
+                *    util_avg * minimum possible divider
+                */
+               sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
  
                 r = removed_runnable;
                 sub_positive(&sa->runnable_avg, r);
@@ -3784,11 +3794,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  
         se->avg.runnable_sum = se->avg.runnable_avg * divider;
  
-       se->avg.load_sum = divider;
-       if (se_weight(se)) {
-               se->avg.load_sum =
-                       div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
-       }
+       se->avg.load_sum = se->avg.load_avg * divider;
+       if (se_weight(se) < se->avg.load_sum)
+               se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+       else
+               se->avg.load_sum = 1;
  
         enqueue_load_avg(cfs_rq, se);
         cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -4448,7 +4458,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         ideal_runtime = sched_slice(cfs_rq, curr);
         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
         if (delta_exec > ideal_runtime) {
-               resched_curr(rq_of(cfs_rq));
+               resched_curr_lazy(rq_of(cfs_rq));
                 /*
                  * The current task ran long enough, ensure it doesn't get
                  * re-elected due to buddy favours.
@@ -4472,7 +4482,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                 return;
  
         if (delta > ideal_runtime)
-               resched_curr(rq_of(cfs_rq));
+               resched_curr_lazy(rq_of(cfs_rq));
  }
  
  static void
@@ -4615,7 +4625,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          * validating it and just reschedule.
          */
         if (queued) {
-               resched_curr(rq_of(cfs_rq));
+               resched_curr_lazy(rq_of(cfs_rq));
                 return;
         }
         /*
@@ -4755,7 +4765,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
          * hierarchy can be throttled
          */
         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-               resched_curr(rq_of(cfs_rq));
+               resched_curr_lazy(rq_of(cfs_rq));
  }
  
  static __always_inline
@@ -4802,8 +4812,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
  
         cfs_rq->throttle_count--;
         if (!cfs_rq->throttle_count) {
-               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
-                                            cfs_rq->throttled_clock_task;
+               cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+                                            cfs_rq->throttled_clock_pelt;
  
                 /* Add cfs_rq with load or one or more already running entities to the list */
                 if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
@@ -4820,7 +4830,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
  
         /* group is entering throttled state, stop time */
         if (!cfs_rq->throttle_count) {
-               cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                 list_del_leaf_cfs_rq(cfs_rq);
         }
         cfs_rq->throttle_count++;
@@ -5264,7 +5274,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
         pcfs_rq = tg->parent->cfs_rq[cpu];
  
         cfs_rq->throttle_count = pcfs_rq->throttle_count;
-       cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+       cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
  }
  
  /* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -5518,7 +5528,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  
                 if (delta < 0) {
                         if (task_current(rq, p))
-                               resched_curr(rq);
+                               resched_curr_lazy(rq);
                         return;
                 }
                 hrtick_start(rq, delta);
@@ -6270,6 +6280,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
  {
         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
         int i, cpu, idle_cpu = -1, nr = INT_MAX;
+       struct sched_domain_shared *sd_share;
         struct rq *this_rq = this_rq();
         int this = smp_processor_id();
         struct sched_domain *this_sd;
@@ -6309,6 +6320,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                 time = cpu_clock(this);
         }
  
+       if (sched_feat(SIS_UTIL)) {
+               sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+               if (sd_share) {
+                       /* because !--nr is the condition to stop scan */
+                       nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+                       /* overloaded LLC is unlikely to have idle cpu/core */
+                       if (nr == 1)
+                               return -1;
+               }
+       }
+
         for_each_cpu_wrap(cpu, cpus, target + 1) {
                 if (has_idle_core) {
                         i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6429,8 +6451,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
          * pattern is IO completions.
          */
         if (is_per_cpu_kthread(current) &&
+           in_task() &&
             prev == smp_processor_id() &&
-           this_rq()->nr_running <= 1) {
+           this_rq()->nr_running <= 1 &&
+           asym_fits_capacity(task_util, prev)) {
                 return prev;
         }
  
@@ -7208,7 +7232,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         return;
  
  preempt:
-       resched_curr(rq);
+       resched_curr_lazy(rq);
         /*
          * Only set the backward buddy when the current task is still
          * on the rq. This can happen when a wakeup gets interleaved
@@ -8993,9 +9017,10 @@ static bool update_pick_idlest(struct sched_group *idlest,
   * This is an approximation as the number of running tasks may not be
   * related to the number of busy CPUs due to sched_setaffinity.
   */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool
+allow_numa_imbalance(unsigned int running, unsigned int weight)
  {
-       return (dst_running < (dst_weight >> 2));
+       return (running < (weight >> 2));
  }
  
  /*
@@ -9129,12 +9154,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                                 return idlest;
  #endif
                         /*
-                        * Otherwise, keep the task on this node to stay close
-                        * its wakeup source and improve locality. If there is
-                        * a real need of migration, periodic load balance will
-                        * take care of it.
+                        * Otherwise, keep the task close to the wakeup source
+                        * and improve locality if the number of running tasks
+                        * would remain below threshold where an imbalance is
+                        * allowed. If there is a real need of migration,
+                        * periodic load balance will take care of it.
                          */
-                       if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight))
                                 return NULL;
                 }
  
@@ -9152,6 +9178,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
         return idlest;
  }
  
+static void update_idle_cpu_scan(struct lb_env *env,
+                                unsigned long sum_util)
+{
+       struct sched_domain_shared *sd_share;
+       int llc_weight, pct;
+       u64 x, y, tmp;
+       /*
+        * Update the number of CPUs to scan in the LLC domain, which is
+        * used as a hint in select_idle_cpu(). The update of sd_share
+        * could be expensive because it is within a shared cache line.
+        * So the write of this hint only occurs during periodic load
+        * balancing, rather than CPU_NEWLY_IDLE, because the latter
+        * can fire way more frequently than the former.
+        */
+       if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+               return;
+
+       llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+       if (env->sd->span_weight != llc_weight) /* not the LLC-sized domain */
+               return;
+
+       sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+       if (!sd_share)
+               return;
+
+       /*
+        * The number of CPUs to search drops as sum_util increases; when
+        * sum_util hits 85% or above, the scan stops.
+        * 85% is chosen as the threshold because it corresponds to the
+        * imbalance_pct (117) at which an LLC sched group is overloaded.
+        *
+        * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
+        * and y'= y / SCHED_CAPACITY_SCALE
+        *
+        * x is the ratio of sum_util compared to the CPU capacity:
+        * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+        * y' is the ratio of CPUs to be scanned in the LLC domain,
+        * and the number of CPUs to scan is calculated by:
+        *
+        * nr_scan = llc_weight * y'                                    [2]
+        *
+        * When x hits the overloaded threshold, i.e. when
+        * x = 100 / pct, y drops to 0. According to [1],
+        * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+        *
+        * Scale x by SCHED_CAPACITY_SCALE:
+        * x' = sum_util / llc_weight;                                  [3]
+        *
+        * and finally [1] becomes:
+        * y = SCHED_CAPACITY_SCALE -
+        *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
+        *
+        */
+       /* equation [3] */
+       x = sum_util;
+       do_div(x, llc_weight);
+
+       /* equation [4] */
+       pct = env->sd->imbalance_pct;
+       tmp = x * x * pct * pct;
+       do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+       tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); /* keep y from going below 0 */
+       y = SCHED_CAPACITY_SCALE - tmp;
+
+       /* equation [2] */
+       y *= llc_weight;
+       do_div(y, SCHED_CAPACITY_SCALE);
+       if ((int)y != sd_share->nr_idle_scan) /* skip the shared write if unchanged */
+               WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
  /**
   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
   * @env: The load balancing environment.
@@ -9164,6 +9261,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
         struct sg_lb_stats tmp_sgs;
+       unsigned long sum_util = 0;
         int sg_status = 0;
  
         do {
@@ -9196,6 +9294,7 @@ next_group:
                 sds->total_load += sgs->group_load;
                 sds->total_capacity += sgs->group_capacity;
  
+               sum_util += sgs->group_util;
                 sg = sg->next;
         } while (sg != env->sd->groups);
  
@@ -9221,6 +9320,8 @@ next_group:
                 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
                 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
         }
+
+       update_idle_cpu_scan(env, sum_util);
  }
  
  #define NUMA_IMBALANCE_MIN 2
@@ -9340,7 +9441,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 /* Consider allowing a small imbalance between NUMA groups */
                 if (env->sd->flags & SD_NUMA) {
                         env->imbalance = adjust_numa_imbalance(env->imbalance,
-                               busiest->sum_nr_running, busiest->group_weight);
+                               local->sum_nr_running + 1, local->group_weight);
                 }
  
                 return;
@@ -11109,7 +11210,7 @@ static void task_fork_fair(struct task_struct *p)
                  * 'current' within the tree based on its new key value.
                  */
                 swap(curr->vruntime, se->vruntime);
-               resched_curr(rq);
+               resched_curr_lazy(rq);
         }
  
         se->vruntime -= cfs_rq->min_vruntime;
@@ -11136,7 +11237,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
          */
         if (task_current(rq, p)) {
                 if (p->prio > oldprio)
-                       resched_curr(rq);
+                       resched_curr_lazy(rq);
         } else
                 check_preempt_curr(rq, p, 0);
  }
@@ -11358,8 +11459,6 @@ void free_fair_sched_group(struct task_group *tg)
  {
         int i;
  
-       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
@@ -11436,6 +11535,8 @@ void unregister_fair_sched_group(struct task_group *tg)
         struct rq *rq;
         int cpu;
  
+       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(cpu) {
                 if (tg->se[cpu])
                         remove_entity_load_avg(tg->se[cpu]);