psi: Fix cpu.pressure for cpu.max and competing cgroups

author    Johannes Weiner <hannes@cmpxchg.org>
          Mon, 16 Mar 2020 19:13:31 +0000 (15:13 -0400)
committer Peter Zijlstra <peterz@infradead.org>
          Fri, 20 Mar 2020 12:06:18 +0000 (13:06 +0100)

For simplicity, cpu pressure is defined as having more than one
runnable task on a given CPU. This works at the system level, but it
has limitations in a cgrouped reality: when cpu.max is in use, it
doesn't capture the time in which a task is not executing on the CPU
due to throttling. Likewise, it doesn't capture the time in which a
competing cgroup is occupying the CPU, meaning it only reflects
cgroup-internal competitive pressure, not outside pressure.

Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from

NR_RUNNING > 1

to

NR_RUNNING > ON_CPU

which will capture the effects of cpu.max as well as competition from
outside the cgroup.
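
As an illustration (not part of the patch), a minimal userspace
sketch of the new PSI_CPU_SOME condition, mirroring test_state() in
kernel/sched/psi.c. Both missed cases, a task throttled by cpu.max
and a task preempted by a competing cgroup, leave a runnable task
with no CPU to run on, which the new test catches:

    /* Sketch of the per-CPU PSI_CPU_SOME test */
    #include <stdbool.h>
    #include <stdio.h>

    static bool cpu_some(unsigned int nr_running, unsigned int nr_oncpu)
    {
            return nr_running > nr_oncpu;   /* was: nr_running > 1 */
    }

    int main(void)
    {
            printf("%d\n", cpu_some(1, 0)); /* throttled or preempted: pressure */
            printf("%d\n", cpu_some(1, 1)); /* executing alone: no pressure */
            printf("%d\n", cpu_some(2, 1)); /* internal competition: pressure */
            return 0;
    }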

After this patch, a cgroup running `stress -c 1` with a cpu.max
setting of "5000 10000" (a 5000us quota per 10000us period, i.e. half
of one CPU) shows ~50% continuous CPU pressure: the task is runnable
the whole time but executing for only half of it.
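
To observe it from userspace, read the cgroup's cpu.pressure file
while the throttled task runs. A hedged sketch, assuming a cgroup2
mount at /sys/fs/cgroup and a cgroup named "test" (both are
assumptions, not part of the patch):

    /* Dump /sys/fs/cgroup/test/cpu.pressure; while the throttled
     * stress task runs, the "some" line should show avg10 near 50. */
    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/fs/cgroup/test/cpu.pressure", "r");

            if (!f) {
                    perror("cpu.pressure");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }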

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200316191333.115523-2-hannes@cmpxchg.org
include/linux/psi_types.h
kernel/sched/core.c
kernel/sched/psi.c
kernel/sched/stats.h

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b..4b72584 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -14,13 +14,21 @@ enum psi_task_count {
        NR_IOWAIT,
        NR_MEMSTALL,
        NR_RUNNING,
-       NR_PSI_TASK_COUNTS = 3,
+       /*
+        * This can't have values other than 0 or 1 and could be
+        * implemented as a bit flag. But for now we still have room
+        * in the first cacheline of psi_group_cpu, and this way we
+        * don't have to special case any state tracking for it.
+        */
+       NR_ONCPU,
+       NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT     (1 << NR_IOWAIT)
 #define TSK_MEMSTALL   (1 << NR_MEMSTALL)
 #define TSK_RUNNING    (1 << NR_RUNNING)
+#define TSK_ONCPU      (1 << NR_ONCPU)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 014d4f7..c1f923d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4091,6 +4091,8 @@ static void __sched notrace __schedule(bool preempt)
                 */
                ++*switch_count;
 
+               psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
                trace_sched_switch(preempt, prev, next);
 
                /* Also unlocks the rq: */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0285207..5012829 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
        case PSI_MEM_FULL:
                return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
        case PSI_CPU_SOME:
-               return tasks[NR_RUNNING] > 1;
+               return tasks[NR_RUNNING] > tasks[NR_ONCPU];
        case PSI_NONIDLE:
                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                        tasks[NR_RUNNING];
@@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
                if (!(m & (1 << t)))
                        continue;
                if (groupc->tasks[t] == 0 && !psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
-                                       clear, set);
+                                       groupc->tasks[3], clear, set);
                        psi_bug = 1;
                }
                groupc->tasks[t]--;
@@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
        rq = task_rq_lock(task, &rf);
 
-       if (task_on_rq_queued(task))
+       if (task_on_rq_queued(task)) {
                task_flags = TSK_RUNNING;
-       else if (task->in_iowait)
+               if (task_current(rq, task))
+                       task_flags |= TSK_ONCPU;
+       } else if (task->in_iowait)
                task_flags = TSK_IOWAIT;
 
        if (task->flags & PF_MEMSTALL)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe..6ff0ac1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -93,6 +93,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
                if (p->flags & PF_MEMSTALL)
                        clear |= TSK_MEMSTALL;
        } else {
+               /*
+                * When a task sleeps, schedule() dequeues it before
+                * switching to the next one. Merge the clearing of
+                * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+                * psi_task_change() call in psi_sched_switch().
+                */
+               clear |= TSK_ONCPU;
+
                if (p->in_iowait)
                        set |= TSK_IOWAIT;
        }
@@ -126,6 +134,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
        }
 }
 
+static inline void psi_sched_switch(struct task_struct *prev,
+                                   struct task_struct *next,
+                                   bool sleep)
+{
+       if (static_branch_likely(&psi_disabled))
+               return;
+
+       /*
+        * Clear the TSK_ONCPU state if the task was preempted. If
+        * it's a voluntary sleep, dequeue will have taken care of it.
+        */
+       if (!sleep)
+               psi_task_change(prev, TSK_ONCPU, 0);
+
+       psi_task_change(next, 0, TSK_ONCPU);
+}
+
 static inline void psi_task_tick(struct rq *rq)
 {
        if (static_branch_likely(&psi_disabled))
@@ -138,6 +163,9 @@ static inline void psi_task_tick(struct rq *rq)
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+                                   struct task_struct *next,
+                                   bool sleep) {}
 static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */