sched/numa: Replace runnable_load_avg by load_avg
Author:     Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Mon, 24 Feb 2020 09:52:15 +0000 (09:52 +0000)
Commit:     Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 24 Feb 2020 10:36:34 +0000 (11:36 +0100)

Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg by load_avg in NUMA load balancing and track
other statistics, like utilization and the number of running tasks, to
get a better view of the current state of a node.
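
For illustration only (not part of the patch): a minimal userspace sketch
of the node classification rule added below. A node is treated as
overloaded when it runs more tasks than it has CPUs and its summed
utilization exceeds its capacity scaled by imbalance_pct, as having spare
capacity when either condition is reversed, and as fully busy otherwise.
All values in the example, including the imbalance_pct of 112, are made up.

  /* Illustrative sketch only -- NOT kernel code; values are hypothetical. */
  #include <stdio.h>

  enum numa_type { node_has_spare, node_fully_busy, node_overloaded };

  struct numa_stats {
          unsigned long util;             /* summed cpu_util() */
          unsigned long compute_capacity; /* summed capacity_of() */
          unsigned int nr_running;        /* summed rq->cfs.h_nr_running */
          unsigned int weight;            /* number of CPUs in the node */
  };

  static enum numa_type numa_classify(unsigned int imbalance_pct,
                                      const struct numa_stats *ns)
  {
          if (ns->nr_running > ns->weight &&
              ns->compute_capacity * 100 < ns->util * imbalance_pct)
                  return node_overloaded;

          if (ns->nr_running < ns->weight ||
              ns->compute_capacity * 100 > ns->util * imbalance_pct)
                  return node_has_spare;

          return node_fully_busy;
  }

  int main(void)
  {
          /* Hypothetical 4-CPU node: 5 runnable tasks, util above capacity. */
          struct numa_stats ns = { .util = 4500, .compute_capacity = 4096,
                                   .nr_running = 5, .weight = 4 };
          const char *name[] = { "has_spare", "fully_busy", "overloaded" };

          /* 112 is only an example imbalance_pct for this sketch. */
          printf("node is %s\n", name[numa_classify(112, &ns)]);
          return 0;
  }

With load, utilization and h_nr_running tracked per node, load_too_imbalanced()
can also short-circuit and report no imbalance while the destination node
still has spare capacity.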

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: "Dietmar Eggemann <dietmar.eggemann@arm.com>"
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/20200224095223.13361-6-mgorman@techsingularity.net
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a6c7f8b..bc3d651 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
-       return cfs_rq_runnable_load_avg(&rq->cfs);
-}
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+       /* The node has spare capacity that can be used to run more tasks.  */
+       node_has_spare = 0,
+       /*
+        * The node is fully used and the tasks don't compete for more CPU
+        * cycles. Nevertheless, some tasks might wait before running.
+        */
+       node_fully_busy,
+       /*
+        * The node is overloaded and can't provide expected CPU cycles to all
+        * tasks.
+        */
+       node_overloaded
+};
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
        unsigned long load;
-
+       unsigned long util;
        /* Total compute capacity of CPUs on a node */
        unsigned long compute_capacity;
+       unsigned int nr_running;
+       unsigned int weight;
+       enum numa_type node_type;
 };
 
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
-{
-       int cpu;
-
-       memset(ns, 0, sizeof(*ns));
-       for_each_cpu(cpu, cpumask_of_node(nid)) {
-               struct rq *rq = cpu_rq(cpu);
-
-               ns->load += cpu_runnable_load(rq);
-               ns->compute_capacity += capacity_of(cpu);
-       }
-
-}
-
 struct task_numa_env {
        struct task_struct *p;
 
@@ -1521,6 +1518,47 @@ struct task_numa_env {
        int best_cpu;
 };
 
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+                        struct numa_stats *ns)
+{
+       if ((ns->nr_running > ns->weight) &&
+           ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+               return node_overloaded;
+
+       if ((ns->nr_running < ns->weight) ||
+           ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+               return node_has_spare;
+
+       return node_fully_busy;
+}
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct task_numa_env *env,
+                             struct numa_stats *ns, int nid)
+{
+       int cpu;
+
+       memset(ns, 0, sizeof(*ns));
+       for_each_cpu(cpu, cpumask_of_node(nid)) {
+               struct rq *rq = cpu_rq(cpu);
+
+               ns->load += cpu_load(rq);
+               ns->util += cpu_util(cpu);
+               ns->nr_running += rq->cfs.h_nr_running;
+               ns->compute_capacity += capacity_of(cpu);
+       }
+
+       ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+       ns->node_type = numa_classify(env->imbalance_pct, ns);
+}
+
 static void task_numa_assign(struct task_numa_env *env,
                             struct task_struct *p, long imp)
 {
@@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
        long orig_src_load, orig_dst_load;
        long src_capacity, dst_capacity;
 
+
+       /* If dst node has spare capacity, there is no real load imbalance */
+       if (env->dst_stats.node_type == node_has_spare)
+               return false;
+
        /*
         * The load is corrected for the CPU capacity available on each node.
         *
@@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
        taskweight = task_weight(p, env.src_nid, dist);
        groupweight = group_weight(p, env.src_nid, dist);
-       update_numa_stats(&env.src_stats, env.src_nid);
+       update_numa_stats(&env, &env.src_stats, env.src_nid);
        taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
        groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
-       update_numa_stats(&env.dst_stats, env.dst_nid);
+       update_numa_stats(&env, &env.dst_stats, env.dst_nid);
 
        /* Try to find a spot on the preferred nid. */
        task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)
 
                        env.dist = dist;
                        env.dst_nid = nid;
-                       update_numa_stats(&env.dst_stats, env.dst_nid);
+                       update_numa_stats(&env, &env.dst_stats, env.dst_nid);
                        task_numa_find_cpu(&env, taskimp, groupimp);
                }
        }
@@ -3686,11 +3729,6 @@ static void remove_entity_load_avg(struct sched_entity *se)
        raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->avg.runnable_load_avg;
-}
-
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 {
        return cfs_rq->avg.load_avg;