[PATCH] sched: cleanup sched_group cpu_power setup

author Siddha, Suresh B <suresh.b.siddha@intel.com>

Tue, 3 Oct 2006 08:14:09 +0000 (01:14 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Tue, 3 Oct 2006 15:04:06 +0000 (08:04 -0700)
author Siddha, Suresh B <suresh.b.siddha@intel.com>
Tue, 3 Oct 2006 08:14:09 +0000 (01:14 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Tue, 3 Oct 2006 15:04:06 +0000 (08:04 -0700)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 8e26c90..331f450 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -625,9 +625,17 @@ enum idle_type
  #define SD_WAKE_BALANCE                64      /* Perform balancing at task wakeup */
  #define SD_SHARE_CPUPOWER      128     /* Domain members share cpu power */
  #define SD_POWERSAVINGS_BALANCE        256     /* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES 512     /* Domain members share cpu pkg resources */
  
-#define BALANCE_FOR_POWER      ((sched_mc_power_savings || sched_smt_power_savings) \
-                                ? SD_POWERSAVINGS_BALANCE : 0)
+#define BALANCE_FOR_MC_POWER   /* used by SD_MC_INIT: SMT savings only */ \
+       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+
+#define BALANCE_FOR_PKG_POWER  /* used by SD_CPU_INIT: MC or SMT savings */ \
+       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
+        SD_POWERSAVINGS_BALANCE : 0)
+
+#define test_sd_parent(sd, flag)       /* 1 if sd's parent has a flag bit */ \
+       (((sd)->parent && ((sd)->parent->flags & (flag))) ? 1 : 0)
  
  
  struct sched_group {
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 486bec2..da508d1 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -115,6 +115,38 @@
  #endif
  #endif /* CONFIG_SCHED_SMT */
  
+#ifdef CONFIG_SCHED_MC
+/* Common values for MC siblings. For now mostly derived from SD_CPU_INIT. */
+#ifndef SD_MC_INIT
+#define SD_MC_INIT (struct sched_domain) {             \
+       .span                   = CPU_MASK_NONE,        \
+       .parent                 = NULL,                 \
+       .child                  = NULL,                 \
+       .groups                 = NULL,                 \
+       .min_interval           = 1,                    \
+       .max_interval           = 4,                    \
+       .busy_factor            = 64,                   \
+       .imbalance_pct          = 125,                  \
+       .cache_nice_tries       = 1,                    \
+       .per_cpu_gain           = 100,                  \
+       .busy_idx               = 2,                    \
+       .idle_idx               = 1,                    \
+       .newidle_idx            = 2,                    \
+       .wake_idx               = 1,                    \
+       .forkexec_idx           = 1,                    \
+       .flags                  = SD_LOAD_BALANCE       \
+                               | SD_BALANCE_NEWIDLE    \
+                               | SD_BALANCE_EXEC       \
+                               | SD_WAKE_AFFINE        \
+                               | SD_SHARE_PKG_RESOURCES\
+                               | BALANCE_FOR_MC_POWER, \
+       .last_balance           = jiffies,              \
+       .balance_interval       = 1,                    \
+       .nr_balance_failed      = 0,                    \
+}
+#endif
+#endif /* CONFIG_SCHED_MC */
+
  /* Common values for CPUs */
  #ifndef SD_CPU_INIT
  #define SD_CPU_INIT (struct sched_domain) {            \
@@ -137,7 +169,7 @@
                                 | SD_BALANCE_NEWIDLE    \
                                 | SD_BALANCE_EXEC       \
                                 | SD_WAKE_AFFINE        \
-                               | BALANCE_FOR_POWER,    \
+                               | BALANCE_FOR_PKG_POWER,\
         .last_balance           = jiffies,              \
         .balance_interval       = 1,                    \
         .nr_balance_failed      = 0,                    \
@@ -168,15 +200,6 @@
         .nr_balance_failed      = 0,                    \
  }
  
-#ifdef CONFIG_SCHED_MC
-#ifndef SD_MC_INIT
-/* for now its same as SD_CPU_INIT.
- * TBD: Tune Domain parameters!
- */
-#define SD_MC_INIT   SD_CPU_INIT
-#endif
-#endif
-
  #ifdef CONFIG_NUMA
  #ifndef SD_NODE_INIT
  #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c

index 0feeacb..0a5e814 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         struct rq *busiest;
         cpumask_t cpus = CPU_MASK_ALL;
  
+       /*
+        * When power savings policy is enabled for the parent domain, idle
+        * sibling can pick up load irrespective of busy siblings. In this case,
+        * let the state of idle sibling percolate up as IDLE, instead of
+        * portraying it as NOT_IDLE.
+        */
         if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-           !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[idle]);
@@ -2638,7 +2644,7 @@ redo:
         }
  
         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-           !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return nr_moved;
  
@@ -2654,7 +2660,7 @@ out_one_pinned:
                 sd->balance_interval *= 2;
  
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                       !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return 0;
  }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         int sd_idle = 0;
         cpumask_t cpus = CPU_MASK_ALL;
  
-       if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+       /*
+        * When power savings policy is enabled for the parent domain, idle
+        * sibling can pick up load irrespective of busy siblings. In this case,
+        * let the state of idle sibling percolate up as IDLE, instead of
+        * portraying it as NOT_IDLE.
+        */
+       if (sd->flags & SD_SHARE_CPUPOWER &&
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
  
         if (!nr_moved) {
                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+                   !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                         return -1;
         } else
                 sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
  out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                                       !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         sd->nr_balance_failed = 0;
  
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
         if (sd->flags & (SD_LOAD_BALANCE |
                          SD_BALANCE_NEWIDLE |
                          SD_BALANCE_FORK |
-                        SD_BALANCE_EXEC)) {
+                        SD_BALANCE_EXEC |
+                        SD_SHARE_CPUPOWER |
+                        SD_SHARE_PKG_RESOURCES)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                 pflags &= ~(SD_LOAD_BALANCE |
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
-                               SD_BALANCE_EXEC);
+                               SD_BALANCE_EXEC |
+                               SD_SHARE_CPUPOWER |
+                               SD_SHARE_PKG_RESOURCES);
         }
         if (~cflags & pflags)
                 return 0;
@@ -6241,12 +6259,65 @@ static void free_sched_groups(const cpumask_t *cpu_map)
  #endif
  
  /*
+ * Initialize sd->groups->cpu_power; only the group's first cpu does the work.
+ *
+ * cpu_power indicates the capacity of a sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be the same
+ * unless there are asymmetries in the topology. If there are asymmetries, the
+ * group having more cpu_power will pick up more load compared to the group
+ * having less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+       struct sched_domain *child;
+       struct sched_group *group;
+
+       WARN_ON(!sd || !sd->groups);
+
+       if (cpu != first_cpu(sd->groups->cpumask))
+               return;
+
+       child = sd->child;
+
+       /*
+        * If there is no child domain, or if this domain uses the perf
+        * policy (no SD_POWERSAVINGS_BALANCE) and the child domain's groups
+        * share resources (for example cores sharing some portions of the
+        * cache hierarchy, or SMT), set this group's cpu_power so that it
+        * can handle only one task when other groups in the domain are idle.
+        */
+       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+                      (child->flags &
+                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               return;
+       }
+
+       sd->groups->cpu_power = 0;
+
+       /*
+        * Add the cpu_power of each child group to this group's cpu_power.
+        */
+       group = child->groups;
+       do {
+               sd->groups->cpu_power += group->cpu_power;
+               group = group->next;
+       } while (group != child->groups);
+}
+
+/*
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
  static int build_sched_domains(const cpumask_t *cpu_map)
  {
         int i;
+       struct sched_domain *sd;
  #ifdef CONFIG_NUMA
         struct sched_group **sched_group_nodes = NULL;
         struct sched_group *sched_group_allnodes = NULL;
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
         /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
         for_each_cpu_mask(i, *cpu_map) {
-               struct sched_domain *sd;
                 sd = &per_cpu(cpu_domains, i);
-               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               init_sched_groups_power(i, sd);
         }
  #endif
  #ifdef CONFIG_SCHED_MC
         for_each_cpu_mask(i, *cpu_map) {
-               int power;
-               struct sched_domain *sd;
                 sd = &per_cpu(core_domains, i);
-               if (sched_smt_power_savings)
-                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-               else
-                       power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-                                           * SCHED_LOAD_SCALE / 10;
-               sd->groups->cpu_power = power;
+               init_sched_groups_power(i, sd);
         }
  #endif
  
         for_each_cpu_mask(i, *cpu_map) {
-               struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-               sd = &per_cpu(phys_domains, i);
-               if (i != first_cpu(sd->groups->cpumask))
-                       continue;
-
-               sd->groups->cpu_power = 0;
-               if (sched_mc_power_savings || sched_smt_power_savings) {
-                       int j;
-
-                       for_each_cpu_mask(j, sd->groups->cpumask) {
-                               struct sched_domain *sd1;
-                               sd1 = &per_cpu(core_domains, j);
-                               /*
-                                * for each core we will add once
-                                * to the group in physical domain
-                                */
-                               if (j != first_cpu(sd1->groups->cpumask))
-                                       continue;
-
-                               if (sched_smt_power_savings)
-                                       sd->groups->cpu_power += sd1->groups->cpu_power;
-                               else
-                                       sd->groups->cpu_power += SCHED_LOAD_SCALE;
-                       }
-               } else
-                       /*
-                        * This has to be < 2 * SCHED_LOAD_SCALE
-                        * Lets keep it SCHED_LOAD_SCALE, so that
-                        * while calculating NUMA group's cpu_power
-                        * we can simply do
-                        *  numa_group->cpu_power += phys_group->cpu_power;
-                        *
-                        * See "only add power once for each physical pkg"
-                        * comment below
-                        */
-                       sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-               int power;
                 sd = &per_cpu(phys_domains, i);
-               if (sched_smt_power_savings)
-                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-               else
-                       power = SCHED_LOAD_SCALE;
-               sd->groups->cpu_power = power;
-#endif
+               init_sched_groups_power(i, sd);
         }
  
  #ifdef CONFIG_NUMA
author	Siddha, Suresh B <suresh.b.siddha@intel.com>
	Tue, 3 Oct 2006 08:14:09 +0000 (01:14 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Tue, 3 Oct 2006 15:04:06 +0000 (08:04 -0700)
include/linux/sched.h		patch \| blob \| history
include/linux/topology.h		patch \| blob \| history
kernel/sched.c		patch \| blob \| history