sched/numa: Calculate node scores in complex NUMA topologies

[platform/kernel/linux-rpi.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 0af3bed..7e5712a 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                 group->faults_cpu[task_faults_idx(nid, 1)];
  }
  
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+                                       int maxdist, bool task)
+{
+       unsigned long score = 0;
+       int node;
+
+       /*
+        * All nodes are directly connected, and the same distance
+        * from each other. No need for fancy placement algorithms.
+        */
+       if (sched_numa_topology_type == NUMA_DIRECT)
+               return 0;
+
+       /*
+        * This code is called for each node, introducing N^2 complexity,
+        * which should be ok given the number of nodes rarely exceeds 8.
+        */
+       for_each_online_node(node) {
+               unsigned long faults;
+               int dist = node_distance(nid, node);
+
+               /*
+                * The furthest away nodes in the system are not interesting
+                * for placement; nid was already counted.
+                */
+               if (dist == sched_max_numa_distance || node == nid)
+                       continue;
+
+               /*
+                * On systems with a backplane NUMA topology, compare groups
+                * of nodes, and move tasks towards the group with the most
+                * memory accesses. When comparing two nodes at distance
+                * "hoplimit", only nodes closer by than "hoplimit" are part
+                * of each group. Skip other nodes.
+                */
+               if (sched_numa_topology_type == NUMA_BACKPLANE &&
+                                       dist > maxdist)
+                       continue;
+
+               /* Add up the faults from nearby nodes. */
+               if (task)
+                       faults = task_faults(p, node);
+               else
+                       faults = group_faults(p, node);
+
+               /*
+                * On systems with a glueless mesh NUMA topology, there are
+                * no fixed "groups of nodes". Instead, nodes that are not
+                * directly connected bounce traffic through intermediate
+                * nodes; a numa_group can occupy any set of nodes.
+                * The further away a node is, the less the faults count.
+                * This seems to result in good task placement.
+                */
+               if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+                       faults *= (sched_max_numa_distance - dist);
+                       faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+               }
+
+               score += faults;
+       }
+
+       return score;
+}
+
  /*
   * These return the fraction of accesses done by a particular task, or
   * task group, on a particular numa node.  The group weight is given a
@@ -945,6 +1010,8 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
                 return 0;
  
         faults = task_faults(p, nid);
+       faults += score_nearby_nodes(p, nid, dist, true);
+
         return 1000 * faults / total_faults;
  }
  
@@ -962,6 +1029,8 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
                 return 0;
  
         faults = group_faults(p, nid);
+       faults += score_nearby_nodes(p, nid, dist, false);
+
         return 1000 * faults / total_faults;
  }
  
@@ -1374,6 +1443,11 @@ static int task_numa_migrate(struct task_struct *p)
                                 continue;
  
                         dist = node_distance(env.src_nid, env.dst_nid);
+                       if (sched_numa_topology_type == NUMA_BACKPLANE &&
+                                               dist != env.dist) {
+                               taskweight = task_weight(p, env.src_nid, dist);
+                               groupweight = group_weight(p, env.src_nid, dist);
+                       }
  
                         /* Only consider nodes where both task and groups benefit */
                         taskimp = task_weight(p, nid, dist) - taskweight;