sched/psi: Fix avgs_work re-arm in psi_avgs_work()
author Chengming Zhou <zhouchengming@bytedance.com>
Fri, 14 Oct 2022 11:05:51 +0000 (19:05 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 27 Jul 2023 06:50:37 +0000 (08:50 +0200)
[ Upstream commit 2fcd7bbae90a6d844da8660a9d27079281dfbba2 ]

Pavan reported that PSI avgs_work idle shutoff is not working at all:
the PSI_NONIDLE condition is observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times() even when
the only task on the CPU is the kworker running avgs_work itself.

Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker sleeps, psi_avgs_work()
will still always re-arm the avgs_work, so shutoff never happens.

This patch switches to a PSI_STATE_RESCHEDULE flag, set in
get_recent_times(), to decide whether to re-arm avgs_work. For the
current CPU, avgs_work is re-armed only when
(NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other CPUs
it is enough to check the PSI_NONIDLE delta. The new flag is only
relevant to psi_avgs_work(), so get_recent_times() checks that
current_work() is avgs_work.
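
As an illustration only (not part of the patch): the stand-alone C model
below mimics this re-arm decision. psi_should_rearm(), struct cpu_sample
and the enum values are hypothetical stand-ins for the kernel's tasks[]
counters and the PSI_NONIDLE delta; the real code sets
PSI_STATE_RESCHEDULE in get_recent_times() instead.

  #include <stdbool.h>
  #include <stdio.h>

  enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_PSI_TASK_COUNTS };

  struct cpu_sample {
          unsigned int tasks[NR_PSI_TASK_COUNTS]; /* task counts on this CPU */
          bool nonidle_delta;                     /* PSI_NONIDLE time moved */
  };

  /* Re-arm only if someone other than the avgs_work kworker was active. */
  static bool psi_should_rearm(const struct cpu_sample *s, bool is_current_cpu)
  {
          if (is_current_cpu)
                  return s->tasks[NR_RUNNING] +
                         s->tasks[NR_IOWAIT] +
                         s->tasks[NR_MEMSTALL] > 1;
          return s->nonidle_delta;
  }

  int main(void)
  {
          /* Only the kworker itself runs on the local CPU: shut off. */
          struct cpu_sample only_kworker = { .tasks = { [NR_RUNNING] = 1 } };
          /* A second runnable task on the local CPU: keep avgs_work armed. */
          struct cpu_sample busy = { .tasks = { [NR_RUNNING] = 2 } };

          printf("only kworker -> re-arm: %d\n", psi_should_rearm(&only_kworker, true));
          printf("busy CPU     -> re-arm: %d\n", psi_should_rearm(&busy, true));
          return 0;
  }

With only the kworker running, the flag stays clear and avgs_work can
shut off; any additional runnable, iowait or memstall task keeps the 2s
re-arm going.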

One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will
be stranded in the per-cpu buckets until avgs_work next runs. The
buckets can hold 4s worth of time, and future activity will wake
avgs_work with a 2s delay, so up to 2s worth of data can be left
behind when avgs_work shuts off. If the kworker runs other work after
avgs_work shuts off and there is no scheduler activity for 2s, this
may be a problem.

Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@bytedance.com
Stable-dep-of: aff037078eca ("sched/psi: use kernfs polling functions for PSI trigger polling")
Signed-off-by: Sasha Levin <sashal@kernel.org>
include/linux/psi_types.h
kernel/sched/psi.c

index 14a1ebb..1e0a0d7 100644
@@ -72,6 +72,9 @@ enum psi_states {
 /* Use one bit in the state mask to track TSK_ONCPU */
 #define PSI_ONCPU      (1 << NR_PSI_STATES)
 
+/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
+#define PSI_STATE_RESCHEDULE   (1 << (NR_PSI_STATES + 1))
+
 enum psi_aggregators {
        PSI_AVGS = 0,
        PSI_POLL,
index e83c321..02e011c 100644
@@ -243,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
                             u32 *pchanged_states)
 {
        struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+       int current_cpu = raw_smp_processor_id();
+       unsigned int tasks[NR_PSI_TASK_COUNTS];
        u64 now, state_start;
        enum psi_states s;
        unsigned int seq;
@@ -257,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
                memcpy(times, groupc->times, sizeof(groupc->times));
                state_mask = groupc->state_mask;
                state_start = groupc->state_start;
+               if (cpu == current_cpu)
+                       memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
        } while (read_seqcount_retry(&groupc->seq, seq));
 
        /* Calculate state time deltas against the previous snapshot */
@@ -281,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
                if (delta)
                        *pchanged_states |= (1 << s);
        }
+
+       /*
+        * When collect_percpu_times() is called from avgs_work, we don't want
+        * to re-arm avgs_work if all CPUs are IDLE. But the current CPU running
+        * this avgs_work is never IDLE, so avgs_work could never be shut off.
+        * Therefore, for the current CPU, re-arm avgs_work only when
+        * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other CPUs
+        * we can just check the PSI_NONIDLE delta.
+        */
+       if (current_work() == &group->avgs_work.work) {
+               bool reschedule;
+
+               if (cpu == current_cpu)
+                       reschedule = tasks[NR_RUNNING] +
+                                    tasks[NR_IOWAIT] +
+                                    tasks[NR_MEMSTALL] > 1;
+               else
+                       reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+               if (reschedule)
+                       *pchanged_states |= PSI_STATE_RESCHEDULE;
+       }
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -416,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
        struct delayed_work *dwork;
        struct psi_group *group;
        u32 changed_states;
-       bool nonidle;
        u64 now;
 
        dwork = to_delayed_work(work);
@@ -427,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
        now = sched_clock();
 
        collect_percpu_times(group, PSI_AVGS, &changed_states);
-       nonidle = changed_states & (1 << PSI_NONIDLE);
        /*
         * If there is task activity, periodically fold the per-cpu
         * times and feed samples into the running averages. If things
@@ -438,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
        if (now >= group->avg_next_update)
                group->avg_next_update = update_averages(group, now);
 
-       if (nonidle) {
+       if (changed_states & PSI_STATE_RESCHEDULE) {
                schedule_delayed_work(dwork, nsecs_to_jiffies(
                                group->avg_next_update - now) + 1);
        }