sched/fair: Rewrite group_imb trigger

author Peter Zijlstra <peterz@infradead.org>

Mon, 19 Aug 2013 10:41:09 +0000 (12:41 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 12 Sep 2013 17:14:42 +0000 (19:14 +0200)
author Peter Zijlstra <peterz@infradead.org>
Mon, 19 Aug 2013 10:41:09 +0000 (12:41 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 12 Sep 2013 17:14:42 +0000 (19:14 +0200)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 11cd136..7325ca7 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
  #define LBF_ALL_PINNED 0x01
  #define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED  0x04
+#define LBF_SOME_PINNED        0x08
  
  struct lb_env {
         struct sched_domain     *sd;
@@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
  
+               env->flags |= LBF_SOME_PINNED;
+
                 /*
                  * Remember if this task can be migrated to any other cpu in
                  * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                  * Also avoid computing new_dst_cpu if we have already computed
                  * one in current iteration.
                  */
-               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
                         return 0;
  
                 /* Prevent to re-select dst_cpu via env's cpus */
                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
-                               env->flags |= LBF_SOME_PINNED;
+                               env->flags |= LBF_DST_PINNED;
                                 env->new_dst_cpu = cpu;
                                 break;
                         }
@@ -4526,13 +4529,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
   * cpu 3 and leave one of the cpus in the second group unused.
   *
   * The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
   *
   * When this is so detected; this group becomes a candidate for busiest; see
   * update_sd_pick_busiest(). And calculcate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
   * to create an effective group imbalance.
   *
   * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +4542,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
   * subtle and fragile situation.
   */
  
-struct sg_imb_stats {
-       unsigned long max_nr_running, min_nr_running;
-       unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
-{
-       sgi->max_cpu_load = sgi->max_nr_running = 0UL;
-       sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
-}
-
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
-                   unsigned long load, unsigned long nr_running)
-{
-       if (load > sgi->max_cpu_load)
-               sgi->max_cpu_load = load;
-       if (sgi->min_cpu_load > load)
-               sgi->min_cpu_load = load;
-
-       if (nr_running > sgi->max_nr_running)
-               sgi->max_nr_running = nr_running;
-       if (sgi->min_nr_running > nr_running)
-               sgi->min_nr_running = nr_running;
-}
-
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
  {
-       /*
-        * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of a task.
-        *
-        * APZ: with cgroup the avg task weight can vary wildly and
-        *      might not be a suitable number - should we keep a
-        *      normalized nr_running number somewhere that negates
-        *      the hierarchy?
-        */
-       if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
-           (sgi->max_nr_running - sgi->min_nr_running) > 1)
-               return 1;
-
-       return 0;
+       return group->sgp->imbalance;
  }
  
  /**
@@ -4597,25 +4559,20 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs)
  {
-       struct sg_imb_stats sgi;
         unsigned long nr_running;
         unsigned long load;
         int i;
  
-       init_sg_imb_stats(&sgi);
-
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 struct rq *rq = cpu_rq(i);
  
                 nr_running = rq->nr_running;
  
                 /* Bias balancing toward cpus of our domain */
-               if (local_group) {
+               if (local_group)
                         load = target_load(i, load_idx);
-               } else {
+               else
                         load = source_load(i, load_idx);
-                       update_sg_imb_stats(&sgi, load, nr_running);
-               }
  
                 sgs->group_load += load;
                 sgs->sum_nr_running += nr_running;
@@ -4635,7 +4592,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         if (sgs->sum_nr_running)
                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  
-       sgs->group_imb = sg_imbalanced(sgs, &sgi);
+       sgs->group_imb = sg_imbalanced(group);
  
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
@@ -5163,6 +5120,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         int *continue_balancing)
  {
         int ld_moved, cur_ld_moved, active_balance = 0;
+       struct sched_domain *sd_parent = sd->parent;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
@@ -5267,11 +5225,11 @@ more_balance:
                  * moreover subsequent load balance cycles should correct the
                  * excess load moved.
                  */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+               if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
  
                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
                         env.dst_cpu      = env.new_dst_cpu;
-                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.flags       &= ~LBF_DST_PINNED;
                         env.loop         = 0;
                         env.loop_break   = sched_nr_migrate_break;
  
@@ -5285,6 +5243,18 @@ more_balance:
                         goto more_balance;
                 }
  
+               /*
+                * We failed to reach balance because of affinity.
+                */
+               if (sd_parent) {
+                       int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                               *group_imbalance = 1;
+                       } else if (*group_imbalance)
+                               *group_imbalance = 0;
+               }
+
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5688,7 +5658,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                                 /*
-                                * The LBF_SOME_PINNED logic could have changed
+                                * The LBF_DST_PINNED logic could have changed
                                  * env->dst_cpu, so we can't know our idle
                                  * state even if we migrated tasks. Update it.
                                  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index b3c5653..0d7544c 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -605,6 +605,7 @@ struct sched_group_power {
          */
         unsigned int power, power_orig;
         unsigned long next_update;
+       int imbalance; /* XXX unrelated to power but shared group state */
         /*
          * Number of busy cpus in this group.
          */
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 19 Aug 2013 10:41:09 +0000 (12:41 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 12 Sep 2013 17:14:42 +0000 (19:14 +0200)
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history