*/
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
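The two counters added above give each runqueue a running count of how many of its tasks take part in NUMA balancing at all (nr_numa_running) and how many of those already run on their preferred node (nr_preferred_running); they are read later by fbq_classify_rq(). A rough sketch of the per-task contribution (illustrative only, not part of the patch):

	/*
	 * Invariant the accounting maintains:
	 *   rq->nr_preferred_running <= rq->nr_numa_running <= rq->nr_running
	 */
	if (p->numa_preferred_nid == -1)
		;				/* not NUMA-managed: neither counter moves */
	else if (p->numa_preferred_nid != task_node(p))
		rq->nr_numa_running++;		/* balanced, but on the wrong node */
	else {
		rq->nr_numa_running++;		/* balanced ... */
		rq->nr_preferred_running++;	/* ... and already where it wants to be */
	}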
struct numa_group {
atomic_t refcount;
if (!total_faults)
return 0;
- return 1200 * group_faults(p, nid) / total_faults;
+ return 1000 * group_faults(p, nid) / total_faults;
}
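With the constant dropped from 1200 to 1000, group_weight() now reports the group's share of faults on the same 0..1000 scale as task_weight(), so the two values can be compared and mixed directly in the placement code below. A worked example with made-up numbers:

	group_faults(p, nid) = 300, group total_faults = 1200
		-> group_weight(p, nid) = 1000 * 300 / 1200 = 250
	task faults on nid   =  50, task  total_faults =  200
		-> task_weight(p, nid)  = 1000 *  50 /  200 = 250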
static unsigned long weighted_cpuload(const int cpu);
* into account that it might be best if the task running on the dst_cpu is
* exchanged with the source task.
*/
-static void task_numa_compare(struct task_numa_env *env, long imp)
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
{
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
long dst_load, src_load;
long load;
+ long imp = (groupimp > 0) ? groupimp : taskimp;
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
goto unlock;
- imp += task_weight(cur, env->src_nid) +
- group_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid) -
- group_weight(cur, env->dst_nid);
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group, then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (env->p->numa_group)
+ imp = groupimp;
+ else
+ imp = taskimp;
+
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid) -
+ group_weight(cur, env->dst_nid);
+ else
+ imp += task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ }
}
if (imp < env->best_imp)
rcu_read_unlock();
}
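The hysteresis above is easiest to see with numbers. Say p and cur belong to the same numa_group, moving p to dst_nid is worth taskimp = 100, and swapping cur back to src_nid is worth another 10 (all values illustrative):

	imp = 100 + 10;		/* = 110 */
	imp -= imp / 16;	/* 110 - 6 = 104 */

Knocking roughly 6% off the combined gain means two members of the same group are only swapped when the difference is clearly more than noise, which keeps tasks from ping-ponging between nodes over tiny fault-count fluctuations.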
-static void task_numa_find_cpu(struct task_numa_env *env, long imp)
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
{
int cpu;
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, imp);
+ task_numa_compare(env, taskimp, groupimp);
}
}
.p = p,
.src_cpu = task_cpu(p),
- .src_nid = cpu_to_node(task_cpu(p)),
+ .src_nid = task_node(p),
.imbalance_pct = 112,
.best_cpu = -1
};
struct sched_domain *sd;
- unsigned long weight;
+ unsigned long taskweight, groupweight;
int nid, ret;
- long imp;
+ long taskimp, groupimp;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
rcu_read_unlock();
- weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid);
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* If the preferred nid has capacity, try to use it. */
if (env.dst_stats.has_capacity)
- task_numa_find_cpu(&env, imp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
if (env.best_cpu == -1) {
continue;
/* Only consider nodes where the task or its group benefits */
- imp = task_weight(p, nid) + group_weight(p, nid) - weight;
- if (imp < 0)
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
continue;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- task_numa_find_cpu(&env, imp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
if (env.best_cpu == -1)
return -EAGAIN;
+ sched_setnuma(p, env.dst_nid);
+
if (env.best_task == NULL) {
int ret = migrate_task_to(p, env.best_cpu);
return ret;
/* Preferred node as the node with the most faults */
if (max_faults && max_nid != p->numa_preferred_nid) {
/* Update the preferred nid and migrate task if possible */
- p->numa_preferred_nid = max_nid;
- p->numa_migrate_seq = 1;
+ sched_setnuma(p, max_nid);
numa_migrate_preferred(p);
}
}
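Both call sites now set the preferred node through sched_setnuma() instead of assigning numa_preferred_nid directly. That matters because of the counters introduced at the top of this excerpt: nr_numa_running and nr_preferred_running are only adjusted on enqueue/dequeue, so the preferred node has to change while the task is off the runqueue. A rough sketch of what sched_setnuma() (defined in kernel/sched/core.c, not shown here) is expected to do:

	/* Sketch under the assumption above -- not the verbatim helper. */
	void sched_setnuma(struct task_struct *p, int nid)
	{
		struct rq *rq = task_rq(p);	/* held via task_rq_lock() */
		int queued = p->on_rq;

		if (queued)
			dequeue_task(rq, p, 0);	/* drop the old counter contribution */
		p->numa_preferred_nid = nid;
		if (queued)
			enqueue_task(rq, p, 0);	/* re-account against the new nid */
	}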
spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
-static void task_numa_group(struct task_struct *p, int cpupid)
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
{
struct numa_group *grp, *my_grp;
struct task_struct *tsk;
if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
goto unlock;
- if (!get_numa_group(grp))
- goto unlock;
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
- join = true;
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ join = false;
unlock:
rcu_read_unlock();
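The joining rules above exist because the cpupid stored in a page's flags keeps only a handful of pid bits, so unrelated tasks can appear to share a page purely through pid aliasing. Requiring either the same mm or a genuine shared fault (TNF_SHARED) filters those collisions out, and *priv reports back whether the fault should still be counted as private. For illustration, assuming the usual 8-bit pid field in the cpupid encoding:

	pid 1000 -> stored as 1000 % 256 = 232
	pid 1256 -> stored as 1256 % 256 = 232	/* indistinguishable to cpupid_match_pid() */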
} else {
priv = cpupid_match_pid(p, last_cpupid);
if (!priv && !(flags & TNF_NO_GROUP))
- task_numa_group(p, last_cpupid);
+ task_numa_group(p, last_cpupid, flags, &priv);
}
/*
if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
numa_migrate_preferred(p);
+ if (migrated)
+ p->numa_pages_migrated += pages;
+
p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
}
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
static void
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
- if (entity_is_task(se))
- list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
#endif
cfs_rq->nr_running++;
}
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
+ }
cfs_rq->nr_running--;
}
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+enum fbq_type { regular, remote, all };
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
+
+ enum fbq_type fbq_type;
};
/*
if (dst_nid == p->numa_preferred_nid)
return true;
- /* After the task has settled, check if the new node is better. */
- if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
- task_weight(p, dst_nid) + group_weight(p, dst_nid) >
- task_weight(p, src_nid) + group_weight(p, src_nid))
+ /* If both task and group weight improve, this move is a winner. */
+ if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+ group_weight(p, dst_nid) > group_weight(p, src_nid))
return true;
return false;
if (src_nid == p->numa_preferred_nid)
return true;
- /* After the task has settled, check if the new node is worse. */
- if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
- task_weight(p, dst_nid) + group_weight(p, dst_nid) <
- task_weight(p, src_nid) + group_weight(p, src_nid))
+ /* If either task or group weight gets worse, don't do it. */
+ if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+ group_weight(p, dst_nid) < group_weight(p, src_nid))
return true;
return false;
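The settle-count test and the summed weights are gone: a migration now has to improve task placement and group placement independently, and is vetoed if either one regresses. With the old summed check, a move like the following would eventually have been allowed once the task had settled, because the task's gain outweighs the group's loss; under the new checks it no longer counts as an improvement, and the degrade test vetoes it (numbers illustrative only):

	task_weight:  src = 300, dst = 450	/* task alone gains 150 */
	group_weight: src = 500, dst = 420	/* the group loses 80   */

	old: (450 + 420) > (300 + 500)			-> move allowed
	new: group_weight(dst) < group_weight(src)	-> degrades, move blocked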
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
/*
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
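fbq_classify_group() and fbq_classify_rq() rely on the enum ordering regular < remote < all; find_busiest_queue() below skips any runqueue whose class is stricter than that of the busiest group. A quick example of the group-side classification (illustrative counts):

	sum_nr_running = 4, nr_numa_running = 3			-> regular
		(at least one non-NUMA task can be moved freely)
	sum_nr_running = 3, nr_numa_running = 3,
	nr_preferred_running = 2				-> remote
		(only NUMA tasks, one of them on the wrong node)
	all three counts equal					-> all
		(every task already runs on its preferred node)

Skipping a queue on this basis only costs extra balance passes, as the comment in find_busiest_queue() notes.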
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct lb_env *env,
- struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
sg = sg->next;
} while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ unsigned long power, capacity, wl;
+ enum fbq_type rt;
+
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there are better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
+ continue;
+
+ power = power_of(i);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- rq = cpu_rq(i);
wl = weighted_cpuload(i);
/*
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
+ .fbq_type = all,
};
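env.fbq_type starts at all, the least restrictive class, and is only tightened in update_sd_lb_stats() when the domain has SD_NUMA set, so balancing in non-NUMA domains (and on !CONFIG_NUMA_BALANCING kernels, where fbq_classify_rq() returns regular) is unchanged. Pulling the three pieces of the mechanism together (condensed, illustrative only):

	env.fbq_type = all;				/* load_balance(): default   */
	if (env->sd->flags & SD_NUMA)			/* update_sd_lb_stats()      */
		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
	if (fbq_classify_rq(rq) > env->fbq_type)	/* find_busiest_queue()      */
		continue;				/* leave well placed NUMA tasks alone */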
/*