Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / sched / core.c
index d2e2e17..d833cc9 100644
@@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
        int prio = p->static_prio - MAX_RT_PRIO;
@@ -2481,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
 {
-       unsigned long this_load = this_rq->load.weight;
-       unsigned long curr_jiffies = jiffies;
-       unsigned long pending_updates;
        int i, scale;
 
        this_rq->nr_load_updates++;
 
-       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2521,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)
        sched_avg_update(this_rq);
 }
 
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+       unsigned long curr_jiffies = jiffies;
+       unsigned long load = this_rq->load.weight;
+       unsigned long pending_updates;
+
+       /*
+        * Bloody broken means of dealing with nohz, but better than nothing..
+        * jiffies is updated by one cpu; another cpu can drift wrt the jiffy
+        * update and see a difference of 0 one time and 2 the next, even
+        * though we ticked at roughly the same rate.
+        *
+        * Hence we only use this from nohz_idle_balance() and skip this
+        * nonsense when called from scheduler_tick(), which is guaranteed
+        * to run at a stable rate.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
+       __update_cpu_load(this_rq, load, pending_updates);
+}
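+
+/*
+ * Example: a CPU that has been idle for five jiffies enters
+ * update_idle_cpu_load() with load == 0 and pending_updates == 5, and
+ * __update_cpu_load() then folds the five missed zero-load ticks into
+ * cpu_load[] in one go.
+ */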
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-       update_cpu_load(this_rq);
+       /*
+        * See the mess in update_idle_cpu_load().
+        */
+       this_rq->last_load_update_tick = jiffies;
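+       /* The tick runs once per jiffy, so there is exactly one pending update. */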
+       __update_cpu_load(this_rq, this_rq->load.weight, 1);
 
        calc_load_account_active(this_rq);
 }
@@ -3108,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
        if (irqs_disabled())
                print_irqtrace_events(prev);
        dump_stack();
+       add_taint(TAINT_WARN);
 }
 
 /*
@@ -5555,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
@@ -5893,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-       int i, n, val, min_val, best_node = -1;
-
-       min_val = INT_MAX;
-
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-
-               if (!nr_cpus_node(n))
-                       continue;
-
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-
-               /* Simple min distance search */
-               val = node_distance(node, n);
-
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-       nodemask_t used_nodes;
-       int i;
-
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-       lockdep_assert_held(&sched_domains_mutex);
-
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-       return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-       return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
        return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
        struct sched_domain **__percpu sd;
        struct sched_group **__percpu sg;
@@ -6015,6 +5954,7 @@ struct sched_domain_topology_level {
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
+       int                 numa_level;
        struct sd_data      data;
 };
 
@@ -6206,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu)  \
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6331,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {
        { sd_init_BOOK, cpu_book_mask, },
 #endif
        { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
        { NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+               return 0;
+
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
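+
+/*
+ * Example: a level whose distance is within REMOTE_DISTANCE (directly
+ * attached remote nodes) keeps SD_BALANCE_EXEC, SD_BALANCE_FORK and
+ * SD_WAKE_AFFINE; more remote levels return 0 here and rely on the
+ * periodic and newidle balancing enabled in sd_numa_init() below.
+ */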
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 125,
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+
+       return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+
+       sched_domains_numa_scale = curr_distance;
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        *
+        * XXX: could be optimized to O(n log n) by using sort()
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       int distance = node_distance(0, j);
+                       if (distance > curr_distance &&
+                                       (distance < next_distance ||
+                                        next_distance == curr_distance))
+                               next_distance = distance;
+               }
+               if (next_distance != curr_distance) {
+                       sched_domains_numa_distance[level++] = next_distance;
+                       sched_domains_numa_levels = level;
+                       curr_distance = next_distance;
+               } else break;
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+        * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
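+        *
+        * Example: with node_distance() values { 10, 20, 40 }, 10 being the
+        * local distance, the loop above ends with level == 2 and
+        * sched_domains_numa_distance[] == { 20, 40 }.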
+        */
+
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are that many hops away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       if (!mask)
+                               return;
+
+                       sched_domains_numa_masks[i][j] = mask;
+
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                       continue;
+
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
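+
+       /*
+        * Continuing the example above, masks[0][j] then spans node j plus
+        * every node within distance 20 of it, while masks[1][j] spans all
+        * nodes within distance 40.
+        */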
+
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+
+       /*
+        * .. and append 'level' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
+
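+       /*
+        * From here on build_sched_domains() (reached via sched_init_smp()
+        * below) sees the extended table and creates one domain per NUMA
+        * level on top of the default topology levels.
+        */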
+       sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
        struct sched_domain_topology_level *tl;
@@ -6707,97 +6812,6 @@ match2:
        mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-       get_online_cpus();
-
-       /* Destroy domains first to force the rebuild */
-       partition_sched_domains(0, NULL, NULL);
-
-       rebuild_sched_domains();
-       put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-       unsigned int level = 0;
-
-       if (sscanf(buf, "%u", &level) != 1)
-               return -EINVAL;
-
-       /*
-        * level is always be positive so don't check for
-        * level < POWERSAVINGS_BALANCE_NONE which is 0
-        * What happens on 0 or 1 byte write,
-        * need to check for count as well?
-        */
-
-       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-               return -EINVAL;
-
-       if (smt)
-               sched_smt_power_savings = level;
-       else
-               sched_mc_power_savings = level;
-
-       reinit_sched_domains();
-
-       return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-                                          struct device_attribute *attr,
-                                          char *buf)
-{
-       return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                           const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-                  sched_mc_power_savings_show,
-                  sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-       return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                            const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-                  sched_smt_power_savings_show,
-                  sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-       int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-       return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6835,6 +6849,8 @@ void __init sched_init_smp(void)
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+       sched_init_numa();
+
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);