Per Sched domain over utilization

author Thara Gopinath <thara.gopinath@linaro.org>

Fri, 23 Jun 2017 14:37:05 +0000 (10:37 -0400)

committer Lukasz Luba <l.luba@partner.samsung.com>

Mon, 10 Sep 2018 08:21:08 +0000 (10:21 +0200)
author Thara Gopinath <thara.gopinath@linaro.org>
Fri, 23 Jun 2017 14:37:05 +0000 (10:37 -0400)
committer Lukasz Luba <l.luba@partner.samsung.com>
Mon, 10 Sep 2018 08:21:08 +0000 (10:21 +0200)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 4b5f3c3cbb369eeb53b5e98cd6e8afc28ee41d9f..e0161c3da0da6c002920aee12dbe33fa7feea76b 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -89,6 +89,8 @@ struct sched_domain_shared {
         atomic_t        ref;
         atomic_t        nr_busy_cpus;
         int             has_idle_cores;
+
+       bool            overutilized;
  };
  
  struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index fdfb2e51c815cd0eaa6e0bf448ad3b9e379e4009..d56d33dd94b5cbb1f45073edf0502d84d128698a 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5155,6 +5155,21 @@ static inline void hrtick_update(struct rq *rq)
  
  static bool cpu_overutilized(int cpu);
  
+static bool sd_overutilized(struct sched_domain *sd)
+{
+       return sd->shared->overutilized;
+}
+
+static void set_sd_overutilized(struct sched_domain *sd)
+{
+       sd->shared->overutilized = true;
+}
+
+static void clear_sd_overutilized(struct sched_domain *sd)
+{
+       sd->shared->overutilized = false;
+}
+
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
@@ -5164,6 +5179,7 @@ static void
  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
         struct cfs_rq *cfs_rq;
+       struct sched_domain *sd;
         struct sched_entity *se = &p->se;
         int task_new = !(flags & ENQUEUE_WAKEUP);
  
@@ -5207,9 +5223,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  
         if (!se) {
                 add_nr_running(rq, 1);
-               if (!task_new && !rq->rd->overutilized &&
+               rcu_read_lock();
+               sd = rcu_dereference(rq->sd);
+               if (!task_new && sd && !sd_overutilized(sd) &&
                     cpu_overutilized(rq->cpu))
-                       rq->rd->overutilized = true;
+                       set_sd_overutilized(sd);
+               rcu_read_unlock();
         }
         hrtick_update(rq);
  }
@@ -6564,8 +6583,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
         unsigned long max_spare = 0;
         struct sched_domain *sd;
  
-       rcu_read_lock();
-
         sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
  
         if (!sd)
@@ -6603,8 +6620,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
         }
  
  unlock:
-       rcu_read_unlock();
-
         if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu))
                 return prev_cpu;
  
@@ -6638,10 +6653,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                               && cpumask_test_cpu(cpu, &p->cpus_allowed);
         }
  
-       if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
-               return select_energy_cpu_brute(p, prev_cpu);
-
         rcu_read_lock();
+       sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+       if (energy_aware() && sd && !sd_overutilized(sd)) {
+               new_cpu = select_energy_cpu_brute(p, prev_cpu);
+               goto unlock;
+       }
+
+       sd = NULL;
+
         for_each_domain(cpu, tmp) {
                 if (!(tmp->flags & SD_LOAD_BALANCE))
                         break;
@@ -6688,6 +6708,8 @@ pick_cpu:
         } else {
                 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
         }
+
+unlock:
         rcu_read_unlock();
  
  #ifdef CONFIG_NO_HZ_COMMON
@@ -7816,6 +7838,7 @@ struct sd_lb_stats {
         unsigned long total_running;
         unsigned long total_load;       /* Total load of all groups in sd */
         unsigned long total_capacity;   /* Total capacity of all groups in sd */
+       unsigned long total_util;       /* Total util of all groups in sd */
         unsigned long avg_load; /* Average load across all groups in sd */
  
         struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7836,6 +7859,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
                 .total_running = 0UL,
                 .total_load = 0UL,
                 .total_capacity = 0UL,
+               .total_util = 0UL,
                 .busiest_stat = {
                         .avg_load = 0UL,
                         .sum_nr_running = 0,
@@ -8131,7 +8155,7 @@ group_type group_classify(struct sched_group *group,
  static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs,
-                       bool *overload, bool *overutilized)
+                       bool *overload, bool *overutilized, bool *misfit_task)
  {
         unsigned long load;
         int i, nr_running;
@@ -8170,8 +8194,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                     !sgs->group_misfit_task && rq->misfit_task)
                         sgs->group_misfit_task = capacity_of(i);
  
-               if (cpu_overutilized(i))
+               if (cpu_overutilized(i)) {
                         *overutilized = true;
+
+                       if (rq->misfit_task)
+                               *misfit_task = true;
+               }
         }
  
         /* Adjust by relative CPU capacity of the group */
@@ -8328,7 +8356,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
         struct sg_lb_stats *local = &sds->local_stat;
         struct sg_lb_stats tmp_sgs;
         int load_idx, prefer_sibling = 0;
-       bool overload = false, overutilized = false;
+       bool overload = false, overutilized = false, misfit_task = false;
  
         if (child && child->flags & SD_PREFER_SIBLING)
                 prefer_sibling = 1;
@@ -8373,7 +8401,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 }
  
                 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-                                               &overload, &overutilized);
+                                               &overload, &overutilized,
+                                               &misfit_task);
  
                 if (local_group)
                         goto next_group;
@@ -8405,6 +8434,7 @@ next_group:
                 sds->total_running += sgs->sum_nr_running;
                 sds->total_load += sgs->group_load;
                 sds->total_capacity += sgs->group_capacity;
+               sds->total_util += sgs->group_util;
  
                 sg = sg->next;
         } while (sg != env->sd->groups);
@@ -8418,14 +8448,46 @@ next_group:
                 /* update overload indicator if we are at root domain */
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
+       }
  
-               /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized)
-                       env->dst_rq->rd->overutilized = overutilized;
-       } else {
-               if (!env->dst_rq->rd->overutilized && overutilized)
-                       env->dst_rq->rd->overutilized = true;
+       if (overutilized)
+               set_sd_overutilized(env->sd);
+       else
+               clear_sd_overutilized(env->sd);
+
+       /*
+        * If there is a misfit task in one cpu in this sched_domain
+        * it is likely that the imbalance cannot be sorted out among
+        * the cpus in this sched_domain. In this case set the
+        * overutilized flag at the parent sched_domain.
+        */
+       if (misfit_task) {
+               struct sched_domain *sd = env->sd->parent;
+
+               /*
+                * In case of a misfit task, load balance at the parent
+                * sched domain level will make sense only if the cpus
+                * have a different capacity. If cpus at a domain level have
+                * the same capacity, the misfit task cannot be well
+                * accommodated in any of the cpus and there is no point in
+                * trying a load balance at this level.
+                */
+               while (sd) {
+                       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+                               set_sd_overutilized(sd);
+                               break;
+                       }
+                       sd = sd->parent;
+               }
         }
+
+       /*
+        * If the domain util is greater than domain capacity, load balancing
+        * needs to be done at the next sched domain level as well.
+        */
+       if (lb_sd_parent(env->sd) &&
+           sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+               set_sd_overutilized(env->sd->parent);
  }
  
  /**
@@ -8652,7 +8714,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
          */
         update_sd_lb_stats(env, &sds);
  
-       if (energy_aware() && !env->dst_rq->rd->overutilized)
+       if (energy_aware() && !sd_overutilized(env->sd))
                 goto out_balanced;
  
         local = &sds.local_stat;
@@ -9581,6 +9643,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                 }
                 max_cost += sd->max_newidle_lb_cost;
  
+               if (energy_aware() && !sd_overutilized(sd))
+                       continue;
+
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
  
@@ -9910,6 +9975,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &curr->se;
+       struct sched_domain *sd;
  
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
@@ -9921,8 +9987,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  
         rq->misfit_task = !task_fits_capacity(curr, capacity_of(rq->cpu));
  
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
-               rq->rd->overutilized = true;
+       rcu_read_lock();
+       sd = rcu_dereference(rq->sd);
+       if (sd && !sd_overutilized(sd) && cpu_overutilized(task_cpu(curr)))
+               set_sd_overutilized(sd);
+       rcu_read_unlock();
  }
  
  /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 549cbda7bff910bb8737bc940a1ebe8bb2949e34..0dab02acac26d58c75608556d1b4202b7325abfe 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -627,9 +627,6 @@ struct root_domain {
         /* Indicate more than one runnable task for any CPU */
         bool overload;
  
-       /* Indicate one or more cpus over-utilized (tipping point) */
-       bool overutilized;
-
         /*
          * The bit corresponding to a CPU gets set here if such CPU has more
          * than one runnable -deadline task (as it is below for RT tasks).
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index ce82adec1dafd38bf9f656108b7033f9e6b9b016..d19d513c04030763ca52a259daea7b2ca5f2788a 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1279,15 +1279,11 @@ sd_init(struct sched_domain_topology_level *tl,
                 sd->idle_idx = 1;
         }
  
-       /*
-        * For all levels sharing cache; connect a sched_domain_shared
-        * instance.
-        */
-       if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-               atomic_inc(&sd->shared->ref);
+       sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+       atomic_inc(&sd->shared->ref);
+
+       if (sd->flags & SD_SHARE_PKG_RESOURCES)
                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-       }
  
         sd->private = sdd;
author	Thara Gopinath <thara.gopinath@linaro.org>
	Fri, 23 Jun 2017 14:37:05 +0000 (10:37 -0400)
committer	Lukasz Luba <l.luba@partner.samsung.com>
	Mon, 10 Sep 2018 08:21:08 +0000 (10:21 +0200)
include/linux/sched/topology.h		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/sched/topology.c		patch \| blob \| history