sched/numa: Take false sharing into account when adapting scan rate
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1473499..d26a16e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
 struct numa_group {
        atomic_t refcount;
 
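The two hooks above maintain per-runqueue counters: how many runnable tasks take part in NUMA balancing at all (they have a preferred node), and how many of those already run on that node. The load-balancer changes further down read these counters to classify runqueues. Below is a minimal standalone sketch of the same counting rule; toy_rq and toy_task are made-up stand-ins for the kernel's struct rq and struct task_struct, and cur_nid stands in for task_node(p).

#include <stdio.h>

/* Toy stand-ins for the kernel structures, for illustration only. */
struct toy_task {
	int numa_preferred_nid;	/* -1 means "no preference yet" */
	int cur_nid;		/* node the task currently runs on */
};

struct toy_rq {
	unsigned int nr_numa_running;	   /* tasks with a preferred node */
	unsigned int nr_preferred_running; /* tasks already on that node  */
};

static void toy_account_numa_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == p->cur_nid);
}

static void toy_account_numa_dequeue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == p->cur_nid);
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };
	struct toy_task well_placed = { .numa_preferred_nid = 1, .cur_nid = 1 };
	struct toy_task misplaced   = { .numa_preferred_nid = 0, .cur_nid = 1 };
	struct toy_task no_pref     = { .numa_preferred_nid = -1, .cur_nid = 1 };

	toy_account_numa_enqueue(&rq, &well_placed);
	toy_account_numa_enqueue(&rq, &misplaced);
	toy_account_numa_enqueue(&rq, &no_pref);
	printf("numa=%u preferred=%u\n",
	       rq.nr_numa_running, rq.nr_preferred_running);

	toy_account_numa_dequeue(&rq, &misplaced);
	printf("numa=%u preferred=%u\n",
	       rq.nr_numa_running, rq.nr_preferred_running);
	return 0;
}

Compiled on its own, the first printf reports numa=2 preferred=1: two of the three tasks have a preference, and only one of them is already home.
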
@@ -962,7 +974,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
        if (!total_faults)
                return 0;
 
-       return 1200 * group_faults(p, nid) / total_faults;
+       return 1000 * group_faults(p, nid) / total_faults;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
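
With this change group_weight() uses the same 1000-based scale as task_weight(), so a task's and its group's attachment to a node become directly comparable per-mille fractions that can be subtracted from one another in the swap logic below. A rough standalone illustration with invented fault counts; weight() here is a simplified stand-in for both helpers.

#include <stdio.h>

/*
 * Both weights are per-mille shares of recorded NUMA hinting faults, so
 * they compare directly.  The fault counts below are invented.
 */
static unsigned long weight(unsigned long faults_on_node,
			    unsigned long total_faults)
{
	if (!total_faults)
		return 0;
	return 1000 * faults_on_node / total_faults;
}

int main(void)
{
	/* Task: 45 of its 90 recorded faults were on node 0. */
	unsigned long task_w = weight(45, 90);		/* 500 */
	/* Its group: 300 of 1000 faults were on node 0. */
	unsigned long group_w = weight(300, 1000);	/* 300 */

	printf("task_weight=%lu group_weight=%lu\n", task_w, group_w);
	return 0;
}
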
@@ -1039,13 +1051,15 @@ static void task_numa_assign(struct task_numa_env *env,
  * into account that it might be best if the task running on the dst_cpu
  * should be exchanged with the source task.
  */
-static void task_numa_compare(struct task_numa_env *env, long imp)
+static void task_numa_compare(struct task_numa_env *env,
+                             long taskimp, long groupimp)
 {
        struct rq *src_rq = cpu_rq(env->src_cpu);
        struct rq *dst_rq = cpu_rq(env->dst_cpu);
        struct task_struct *cur;
        long dst_load, src_load;
        long load;
+       long imp = (groupimp > 0) ? groupimp : taskimp;
 
        rcu_read_lock();
        cur = ACCESS_ONCE(dst_rq->curr);
@@ -1064,10 +1078,37 @@ static void task_numa_compare(struct task_numa_env *env, long imp)
                if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
                        goto unlock;
 
-               imp += task_weight(cur, env->src_nid) +
-                      group_weight(cur, env->src_nid) -
-                      task_weight(cur, env->dst_nid) -
-                      group_weight(cur, env->dst_nid);
+               /*
+                * If dst and source tasks are in the same NUMA group, or not
+                * in any group, then look only at task weights.
+                */
+               if (cur->numa_group == env->p->numa_group) {
+                       imp = taskimp + task_weight(cur, env->src_nid) -
+                             task_weight(cur, env->dst_nid);
+                       /*
+                        * Add some hysteresis to prevent swapping the
+                        * tasks within a group over tiny differences.
+                        */
+                       if (cur->numa_group)
+                               imp -= imp/16;
+               } else {
+                       /*
+                        * Compare the group weights. If a task is all by
+                        * itself (not part of a group), use the task weight
+                        * instead.
+                        */
+                       if (env->p->numa_group)
+                               imp = groupimp;
+                       else
+                               imp = taskimp;
+
+                       if (cur->numa_group)
+                               imp += group_weight(cur, env->src_nid) -
+                                      group_weight(cur, env->dst_nid);
+                       else
+                               imp += task_weight(cur, env->src_nid) -
+                                      task_weight(cur, env->dst_nid);
+               }
        }
 
        if (imp < env->best_imp)
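
When the destination CPU is idle, the improvement defaults to the group gain if it is positive and to the task gain otherwise (the new imp initialiser at the top of the function). When a task cur already runs there, the block above decides which scale to compare on: task weights if cur and p share a numa_group (or neither has one), group weights otherwise, with a 1/16 hysteresis so group members are not swapped over noise. The standalone sketch below condenses just that decision; all weight deltas are passed in as precomputed numbers rather than taken from task_weight()/group_weight(), and the names are invented for illustration.

#include <stdio.h>

/*
 * Condensed decision from task_numa_compare(): which "improvement" value
 * to use when a candidate task 'cur' already runs on the destination CPU.
 */
static long swap_improvement(long taskimp, long groupimp,
			     int same_group,      /* cur and p share a numa_group */
			     int cur_in_group,    /* cur belongs to some group    */
			     int p_in_group,      /* p belongs to some group      */
			     long cur_task_delta, /* cur: task_weight(src) - task_weight(dst)   */
			     long cur_group_delta)/* cur: group_weight(src) - group_weight(dst) */
{
	long imp;

	if (same_group) {
		/* Same group (or both ungrouped): compare task weights only. */
		imp = taskimp + cur_task_delta;
		/* Hysteresis: don't swap group members over tiny differences. */
		if (cur_in_group)
			imp -= imp / 16;
	} else {
		/* Different groups: compare group weights where they exist. */
		imp = p_in_group ? groupimp : taskimp;
		imp += cur_in_group ? cur_group_delta : cur_task_delta;
	}
	return imp;
}

int main(void)
{
	/* Two members of one group: 200 task gain, cur loses 40 -> 150 after hysteresis. */
	printf("%ld\n", swap_improvement(200, 500, 1, 1, 1, -40, 0));
	/* Unrelated groups: 500 group gain minus cur's 100 group loss -> 400. */
	printf("%ld\n", swap_improvement(200, 500, 0, 1, 1, 0, -100));
	return 0;
}
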
@@ -1117,7 +1158,8 @@ unlock:
        rcu_read_unlock();
 }
 
-static void task_numa_find_cpu(struct task_numa_env *env, long imp)
+static void task_numa_find_cpu(struct task_numa_env *env,
+                               long taskimp, long groupimp)
 {
        int cpu;
 
@@ -1127,7 +1169,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, long imp)
                        continue;
 
                env->dst_cpu = cpu;
-               task_numa_compare(env, imp);
+               task_numa_compare(env, taskimp, groupimp);
        }
 }
 
@@ -1137,7 +1179,7 @@ static int task_numa_migrate(struct task_struct *p)
                .p = p,
 
                .src_cpu = task_cpu(p),
-               .src_nid = cpu_to_node(task_cpu(p)),
+               .src_nid = task_node(p),
 
                .imbalance_pct = 112,
 
@@ -1146,9 +1188,9 @@ static int task_numa_migrate(struct task_struct *p)
                .best_cpu = -1
        };
        struct sched_domain *sd;
-       unsigned long weight;
+       unsigned long taskweight, groupweight;
        int nid, ret;
-       long imp;
+       long taskimp, groupimp;
 
        /*
         * Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1163,15 +1205,17 @@ static int task_numa_migrate(struct task_struct *p)
        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
        rcu_read_unlock();
 
-       weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid);
+       taskweight = task_weight(p, env.src_nid);
+       groupweight = group_weight(p, env.src_nid);
        update_numa_stats(&env.src_stats, env.src_nid);
        env.dst_nid = p->numa_preferred_nid;
-       imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight;
+       taskimp = task_weight(p, env.dst_nid) - taskweight;
+       groupimp = group_weight(p, env.dst_nid) - groupweight;
        update_numa_stats(&env.dst_stats, env.dst_nid);
 
        /* If the preferred nid has capacity, try to use it. */
        if (env.dst_stats.has_capacity)
-               task_numa_find_cpu(&env, imp);
+               task_numa_find_cpu(&env, taskimp, groupimp);
 
        /* No space available on the preferred nid. Look elsewhere. */
        if (env.best_cpu == -1) {
@@ -1180,13 +1224,14 @@ static int task_numa_migrate(struct task_struct *p)
                                continue;
 
                         /* Only consider nodes where both the task and its group benefit */
-                       imp = task_weight(p, nid) + group_weight(p, nid) - weight;
-                       if (imp < 0)
+                       taskimp = task_weight(p, nid) - taskweight;
+                       groupimp = group_weight(p, nid) - groupweight;
+                       if (taskimp < 0 && groupimp < 0)
                                continue;
 
                        env.dst_nid = nid;
                        update_numa_stats(&env.dst_stats, env.dst_nid);
-                       task_numa_find_cpu(&env, imp);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
                }
        }
 
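Splitting imp into taskimp and groupimp means task_numa_migrate() can accept a destination node when either the task itself or its numa_group gains from the move, instead of requiring the combined sum to improve. The sketch below walks candidate nodes with invented per-mille weights and also shows the imbalance_pct halving visible in the context above (a domain value of 125 becomes 112); the arrays and node count are made up for illustration.

#include <stdio.h>

/* Invented per-node weights for one task and its group (per-mille scale). */
static const long task_w[4]  = { 250, 450, 150, 150 };
static const long group_w[4] = { 100, 600, 200, 100 };

int main(void)
{
	int src_nid = 0, nid;
	long taskweight = task_w[src_nid];
	long groupweight = group_w[src_nid];

	/* Halve how far imbalance_pct strays from 100, as the patch does. */
	int sd_imbalance_pct = 125;
	int imbalance_pct = 100 + (sd_imbalance_pct - 100) / 2;	/* 112 */

	printf("imbalance_pct=%d\n", imbalance_pct);

	for (nid = 0; nid < 4; nid++) {
		long taskimp, groupimp;

		if (nid == src_nid)
			continue;
		taskimp = task_w[nid] - taskweight;
		groupimp = group_w[nid] - groupweight;
		/* Skip nodes where neither the task nor its group gains. */
		if (taskimp < 0 && groupimp < 0)
			continue;
		printf("nid=%d taskimp=%ld groupimp=%ld\n",
		       nid, taskimp, groupimp);
	}
	return 0;
}
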
@@ -1194,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
        if (env.best_cpu == -1)
                return -EAGAIN;
 
+       sched_setnuma(p, env.dst_nid);
+
        if (env.best_task == NULL) {
                int ret = migrate_task_to(p, env.best_cpu);
                return ret;
@@ -1309,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
        /* Preferred node as the node with the most faults */
        if (max_faults && max_nid != p->numa_preferred_nid) {
                /* Update the preferred nid and migrate task if possible */
-               p->numa_preferred_nid = max_nid;
-               p->numa_migrate_seq = 1;
+               sched_setnuma(p, max_nid);
                numa_migrate_preferred(p);
        }
 }
@@ -1335,7 +1381,8 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2)
        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
 }
 
-static void task_numa_group(struct task_struct *p, int cpupid)
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+                       int *priv)
 {
        struct numa_group *grp, *my_grp;
        struct task_struct *tsk;
@@ -1393,10 +1440,19 @@ static void task_numa_group(struct task_struct *p, int cpupid)
        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
                goto unlock;
 
-       if (!get_numa_group(grp))
-               goto unlock;
+       /* Always join threads in the same process. */
+       if (tsk->mm == current->mm)
+               join = true;
+
+       /* Simple filter to avoid false positives due to PID collisions */
+       if (flags & TNF_SHARED)
+               join = true;
 
-       join = true;
+       /* Update priv based on whether false sharing was detected */
+       *priv = !join;
+
+       if (join && !get_numa_group(grp))
+               join = false;
 
 unlock:
        rcu_read_unlock();
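
This hunk is the heart of the patch named in the subject: sharing a last_cpupid no longer forces a group merge on its own. The tasks are only grouped when they share an address space or the fault was flagged TNF_SHARED, and priv is recomputed from that decision so that false sharing (for example a PID collision on an unrelated page) feeds back into the fault statistics as a private access. A standalone sketch of just that join/priv decision follows; same_mm and TOY_TNF_SHARED are simplified stand-ins for the tsk->mm == current->mm test and the kernel's TNF_SHARED flag.

#include <stdbool.h>
#include <stdio.h>

#define TOY_TNF_SHARED 0x01	/* stand-in for the kernel's TNF_SHARED flag */

/*
 * Decide whether the faulting task should join the group of the task that
 * last touched the page, and report whether the access should still be
 * treated as private (likely false sharing, e.g. a PID collision).
 */
static bool numa_group_join(bool same_mm, int flags, int *priv)
{
	bool join = false;

	/* Always join threads in the same process. */
	if (same_mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions. */
	if (flags & TOY_TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected. */
	*priv = !join;
	return join;
}

int main(void)
{
	int priv;
	bool join;

	/* Unrelated task, page not marked shared: no merge, access stays private. */
	join = numa_group_join(false, 0, &priv);
	printf("join=%d priv=%d\n", join, priv);

	/* Genuinely shared access: join the other task's group. */
	join = numa_group_join(false, TOY_TNF_SHARED, &priv);
	printf("join=%d priv=%d\n", join, priv);
	return 0;
}
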
@@ -1493,7 +1549,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
        } else {
                priv = cpupid_match_pid(p, last_cpupid);
                if (!priv && !(flags & TNF_NO_GROUP))
-                       task_numa_group(p, last_cpupid);
+                       task_numa_group(p, last_cpupid, flags, &priv);
        }
 
        /*
@@ -1515,6 +1571,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
        if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
                numa_migrate_preferred(p);
 
+       if (migrated)
+               p->numa_pages_migrated += pages;
+
        p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
 }
 
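task_numa_fault() then folds the fault into a per-task buffer that, judging by the task_faults_idx(node, priv) call above, keeps separate private and shared counters per node, and with this patch it also tallies successfully migrated pages. A toy version of that bookkeeping is sketched below; the two-slots-per-node layout and the index formula are assumptions for illustration, not code quoted from the kernel.

#include <stdio.h>

#define TOY_NR_NODES 4

/* Two slots per node, indexed by the priv flag: 0 = shared, 1 = private (assumed layout). */
static unsigned long faults_buffer[2 * TOY_NR_NODES];
static unsigned long pages_migrated;

static int toy_faults_idx(int node, int priv)
{
	return 2 * node + priv;
}

static void toy_record_fault(int node, int priv, int pages, int migrated)
{
	if (migrated)
		pages_migrated += pages;
	faults_buffer[toy_faults_idx(node, priv)] += pages;
}

int main(void)
{
	toy_record_fault(1, 1, 8, 1);	/* private fault, pages were migrated */
	toy_record_fault(1, 0, 4, 0);	/* shared fault, no migration */

	printf("node1 shared=%lu private=%lu migrated=%lu\n",
	       faults_buffer[toy_faults_idx(1, 0)],
	       faults_buffer[toy_faults_idx(1, 1)],
	       pages_migrated);
	return 0;
}
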
@@ -1705,6 +1764,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1714,8 +1781,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (!parent_entity(se))
                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-       if (entity_is_task(se))
-               list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+       if (entity_is_task(se)) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               account_numa_enqueue(rq, task_of(se));
+               list_add(&se->group_node, &rq->cfs_tasks);
+       }
 #endif
        cfs_rq->nr_running++;
 }
@@ -1726,8 +1797,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-       if (entity_is_task(se))
+       if (entity_is_task(se)) {
+               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                list_del_init(&se->group_node);
+       }
        cfs_rq->nr_running--;
 }
 
@@ -4569,6 +4642,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
@@ -4595,6 +4670,8 @@ struct lb_env {
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
+
+       enum fbq_type           fbq_type;
 };
 
 /*
@@ -4676,10 +4753,9 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
        if (dst_nid == p->numa_preferred_nid)
                return true;
 
-       /* After the task has settled, check if the new node is better. */
-       if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
-                       task_weight(p, dst_nid) + group_weight(p, dst_nid) >
-                       task_weight(p, src_nid) + group_weight(p, src_nid))
+       /* If both task and group weight improve, this move is a winner. */
+       if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+           group_weight(p, dst_nid) > group_weight(p, src_nid))
                return true;
 
        return false;
@@ -4706,10 +4782,9 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
        if (src_nid == p->numa_preferred_nid)
                return true;
 
-       /* After the task has settled, check if the new node is worse. */
-       if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
-                       task_weight(p, dst_nid) + group_weight(p, dst_nid) <
-                       task_weight(p, src_nid) + group_weight(p, src_nid))
+       /* If either task or group weight get worse, don't do it. */
+       if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+           group_weight(p, dst_nid) < group_weight(p, src_nid))
                return true;
 
        return false;
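
The two locality helpers are now stricter and symmetric: a pull only improves locality when both the task weight and the group weight rise on the destination node, and it degrades locality as soon as either one falls, replacing the old sum-after-settling comparison. The sketch below covers only this weight comparison (the preferred-node shortcuts earlier in both functions are left out); the struct and the numbers are invented.

#include <stdbool.h>
#include <stdio.h>

/* Precomputed per-node weights for one task; values invented for the example. */
struct weights {
	long task_src, task_dst;
	long group_src, group_dst;
};

/* Improvement requires both the task and its group to prefer dst. */
static bool improves_locality(const struct weights *w)
{
	return w->task_dst > w->task_src && w->group_dst > w->group_src;
}

/* Degradation triggers if either the task or its group prefers src. */
static bool degrades_locality(const struct weights *w)
{
	return w->task_dst < w->task_src || w->group_dst < w->group_src;
}

int main(void)
{
	struct weights mixed = {
		.task_src = 300, .task_dst = 500,	/* task gains  */
		.group_src = 400, .group_dst = 200,	/* group loses */
	};

	/* Mixed result: not an improvement, and counted as degrading. */
	printf("improves=%d degrades=%d\n",
	       improves_locality(&mixed), degrades_locality(&mixed));
	return 0;
}
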
@@ -5058,6 +5133,10 @@ struct sg_lb_stats {
        unsigned int group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -5375,6 +5454,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                sgs->group_load += load;
                sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+               sgs->nr_numa_running += rq->nr_numa_running;
+               sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                sgs->sum_weighted_load += weighted_cpuload(i);
                if (idle_cpu(i))
                        sgs->idle_cpus++;
@@ -5440,14 +5523,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running > sgs->nr_numa_running)
+               return regular;
+       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+               return remote;
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       if (rq->nr_running > rq->nr_numa_running)
+               return regular;
+       if (rq->nr_running > rq->nr_preferred_running)
+               return remote;
+       return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                       struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
@@ -5504,6 +5616,9 @@ next_group:
 
                sg = sg->next;
        } while (sg != env->sd->groups);
+
+       if (env->sd->flags & SD_NUMA)
+               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5807,15 +5922,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        int i;
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
+               unsigned long power, capacity, wl;
+               enum fbq_type rt;
+
+               rq = cpu_rq(i);
+               rt = fbq_classify_rq(rq);
 
+               /*
+                * We classify groups/runqueues into three groups:
+                *  - regular: there are !numa tasks
+                *  - remote:  there are numa tasks that run on the 'wrong' node
+                *  - all:     there is no distinction
+                *
+                * In order to avoid migrating ideally placed numa tasks,
+                * ignore those when there are better options.
+                *
+                * If we ignore the actual busiest queue to migrate another
+                * task, the next balance pass can still reduce the busiest
+                * queue by moving tasks around inside the node.
+                *
+                * If we cannot move enough load due to this classification
+                * the next pass will adjust the group classification and
+                * allow migration of more tasks.
+                *
+                * Both cases only affect the total convergence complexity.
+                */
+               if (rt > env->fbq_type)
+                       continue;
+
+               power = power_of(i);
+               capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                if (!capacity)
                        capacity = fix_small_capacity(env->sd, group);
 
-               rq = cpu_rq(i);
                wl = weighted_cpuload(i);
 
                /*
@@ -5932,6 +6071,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
+               .fbq_type       = all,
        };
 
        /*
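
Taken together, the fbq_type machinery works like this: each runqueue (and, in update_sd_lb_stats(), the busiest group) is classified from the counters introduced earlier as regular (it still has non-NUMA tasks), remote (only NUMA tasks, some on the wrong node) or all (everything is already well placed); load_balance() starts out permissive with .fbq_type = all, and on SD_NUMA domains find_busiest_queue() then skips runqueues that are better placed than the busiest group as a whole, which is what rt > env->fbq_type expresses via the enum ordering regular < remote < all. A standalone sketch of that classification and filter with made-up counters:

#include <stdio.h>

/* Same ordering as the patch: regular < remote < all. */
enum fbq_type { regular, remote, all };

struct toy_rq {
	unsigned int nr_running;
	unsigned int nr_numa_running;      /* tasks with a preferred node */
	unsigned int nr_preferred_running; /* tasks already on that node  */
};

static enum fbq_type classify(const struct toy_rq *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;	/* there are non-NUMA tasks to move first */
	if (rq->nr_running > rq->nr_preferred_running)
		return remote;	/* only NUMA tasks, some on the wrong node */
	return all;		/* every task is where it wants to be */
}

int main(void)
{
	static const char *name[] = { "regular", "remote", "all" };
	struct toy_rq rqs[] = {
		{ .nr_running = 4, .nr_numa_running = 2, .nr_preferred_running = 2 },
		{ .nr_running = 3, .nr_numa_running = 3, .nr_preferred_running = 1 },
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 2 },
	};
	enum fbq_type allowed = remote;	/* e.g. busiest group classified as remote */
	unsigned int i;

	for (i = 0; i < sizeof(rqs) / sizeof(rqs[0]); i++) {
		enum fbq_type rt = classify(&rqs[i]);

		printf("rq%u is %s: %s\n", i, name[rt],
		       rt > allowed ? "skipped" : "considered");
	}
	return 0;
}

Here the group classification allows up to remote, so the queue whose tasks all sit on their preferred node is left alone while rq0 and rq1 remain candidates.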