workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE

author Tejun Heo <tj@kernel.org>

Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)

committer Tejun Heo <tj@kernel.org>

Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
author Tejun Heo <tj@kernel.org>
Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
committer Tejun Heo <tj@kernel.org>
Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index 9e5bab29685ff0534fe82714dda5e20804c38ead..1f2185cf2f0a1f3cf75be7a631595dc38c5af9df 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6931,6 +6931,13 @@
                         it can be updated at runtime by writing to the
                         corresponding sysfs file.
  
+       workqueue.cpu_intensive_thresh_us=
+                       Per-cpu work items which run for longer than this
+                       threshold are automatically considered CPU intensive
+                       and excluded from concurrency management to prevent
+                       them from noticeably delaying other per-cpu work
+                       items. Default is 10000 (10ms).
+
         workqueue.disable_numa
                         By default, all work items queued to unbound
                         workqueues are affine to the NUMA nodes they're
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst

index 7e5c39310bbfd074063bed5c6d476ec0eb36b804..a389f31b025c9bad14921fe89ba4dc386ca4674f 100644 (file)
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -354,25 +354,25 @@ Monitoring
  Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
  
    $ tools/workqueue/wq_monitor.py events
-                              total  infl  CMwake  mayday rescued
-  events                      18545     0       5       -       -
-  events_highpri                  8     0       0       -       -
-  events_long                     3     0       0       -       -
-  events_unbound              38306     0       -       -       -
-  events_freezable                0     0       0       -       -
-  events_power_efficient      29598     0       0       -       -
-  events_freezable_power_        10     0       0       -       -
-  sock_diag_events                0     0       0       -       -
-
-                              total  infl  CMwake  mayday rescued
-  events                      18548     0       5       -       -
-  events_highpri                  8     0       0       -       -
-  events_long                     3     0       0       -       -
-  events_unbound              38322     0       -       -       -
-  events_freezable                0     0       0       -       -
-  events_power_efficient      29603     0       0       -       -
-  events_freezable_power_        10     0       0       -       -
-  sock_diag_events                0     0       0       -       -
+                              total  infl CPUitsv  CMwake  mayday rescued
+  events                      18545     0       0       5       -       -
+  events_highpri                  8     0       0       0       -       -
+  events_long                     3     0       0       0       -       -
+  events_unbound              38306     0       -       -       -       -
+  events_freezable                0     0       0       0       -       -
+  events_power_efficient      29598     0       0       0       -       -
+  events_freezable_power_        10     0       0       0       -       -
+  sock_diag_events                0     0       0       0       -       -
+
+                              total  infl CPUitsv  CMwake  mayday rescued
+  events                      18548     0       0       5       -       -
+  events_highpri                  8     0       0       0       -       -
+  events_long                     3     0       0       0       -       -
+  events_unbound              38322     0       -       -       -       -
+  events_freezable                0     0       0       0       -       -
+  events_power_efficient      29603     0       0       0       -       -
+  events_freezable_power_        10     0       0       0       -       -
+  sock_diag_events                0     0       0       0       -       -
  
    ...
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 944c3ae39861cf68d344787a5487a81e92b1f9b0..3484cada9a4a0b1f6aadf85fa3abf1ecdef02417 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5632,6 +5632,9 @@ void scheduler_tick(void)
  
         perf_event_task_tick();
  
+       if (curr->flags & PF_WQ_WORKER)
+               wq_worker_tick(curr);
+
  #ifdef CONFIG_SMP
         rq->idle_balance = idle_cpu(cpu);
         trigger_load_balance(rq);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index 9424214436036eb90578a1f887e4a166a8cbf733..3dc83d5eba506013c72297b732d53ff7c2317ee3 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -212,6 +212,7 @@ struct worker_pool {
  enum pool_workqueue_stats {
         PWQ_STAT_STARTED,       /* work items started execution */
         PWQ_STAT_COMPLETED,     /* work items completed execution */
+       PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
         PWQ_STAT_CM_WAKEUP,     /* concurrency-management worker wakeups */
         PWQ_STAT_MAYDAY,        /* maydays to rescuer */
         PWQ_STAT_RESCUED,       /* linked work items executed by rescuer */
@@ -332,6 +333,14 @@ static struct kmem_cache *pwq_cache;
  static cpumask_var_t *wq_numa_possible_cpumask;
                                         /* possible CPUs of each node */
  
+/*
+ * Per-cpu work items which run for longer than the following threshold are
+ * automatically considered CPU intensive and excluded from concurrency
+ * management to prevent them from noticeably delaying other per-cpu work items.
+ */
+static unsigned long wq_cpu_intensive_thresh_us = 10000;
+module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
+
  static bool wq_disable_numa;
  module_param_named(disable_numa, wq_disable_numa, bool, 0444);
  
@@ -962,6 +971,13 @@ void wq_worker_running(struct task_struct *task)
         if (!(worker->flags & WORKER_NOT_RUNNING))
                 worker->pool->nr_running++;
         preempt_enable();
+
+       /*
+        * CPU intensive auto-detection cares about how long a work item hogged
+        * CPU without sleeping. Reset the starting timestamp on wakeup.
+        */
+       worker->current_at = worker->task->se.sum_exec_runtime;
+
         worker->sleeping = 0;
  }
  
@@ -1012,6 +1028,45 @@ void wq_worker_sleeping(struct task_struct *task)
         raw_spin_unlock_irq(&pool->lock);
  }
  
+/**
+ * wq_worker_tick - a scheduler tick occurred while a kworker is running
+ * @task: task currently running
+ *
+ * Called from scheduler_tick(). We're in the IRQ context and the current
+ * worker's fields which follow the 'K' locking rule can be accessed safely.
+ */
+void wq_worker_tick(struct task_struct *task)
+{
+       struct worker *worker = kthread_data(task);
+       struct pool_workqueue *pwq = worker->current_pwq;
+       struct worker_pool *pool = worker->pool;
+
+       if (!pwq)
+               return;
+
+       /*
+        * Mark a concurrency-managed worker which hogged the CPU for longer
+        * than wq_cpu_intensive_thresh_us CPU_INTENSIVE. Skip if ->sleeping is
+        * set: the worker is switching out and wq_worker_sleeping() already
+        * dropped it from nr_running, so setting the flag would decrement it
+        * twice. The u64 cast stops the us -> ns product wrapping on 32-bit.
+        */
+       if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
+           worker->task->se.sum_exec_runtime - worker->current_at <
+           (u64)wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
+               return;
+
+       raw_spin_lock(&pool->lock);
+       worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+       pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
+
+       if (need_more_worker(pool)) {
+               pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+               wake_up_worker(pool);
+       }
+       raw_spin_unlock(&pool->lock);
+}
+
  /**
   * wq_worker_last_func - retrieve worker's last work function
   * @task: Task to retrieve last work function of.
@@ -2327,7 +2382,6 @@ __acquires(&pool->lock)
  {
         struct pool_workqueue *pwq = get_work_pwq(work);
         struct worker_pool *pool = worker->pool;
-       bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
         unsigned long work_data;
         struct worker *collision;
  #ifdef CONFIG_LOCKDEP
@@ -2364,6 +2418,7 @@ __acquires(&pool->lock)
         worker->current_work = work;
         worker->current_func = work->func;
         worker->current_pwq = pwq;
+       worker->current_at = worker->task->se.sum_exec_runtime;
         work_data = *work_data_bits(work);
         worker->current_color = get_work_color(work_data);
  
@@ -2381,7 +2436,7 @@ __acquires(&pool->lock)
          * of concurrency management and the next code block will chain
          * execution of the pending work items.
          */
-       if (unlikely(cpu_intensive))
+       if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
  
         /*
@@ -2461,9 +2516,12 @@ __acquires(&pool->lock)
  
         raw_spin_lock_irq(&pool->lock);
  
-       /* clear cpu intensive status */
-       if (unlikely(cpu_intensive))
-               worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+       /*
+        * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
+        * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
+        * wq_cpu_intensive_thresh_us. Clear it.
+        */
+       worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
  
         /* tag the worker for identification in schedule() */
         worker->last_func = worker->current_func;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h

index c2455be7b4c2367dd3b5db0ecff848b9906b6207..6b1d66e2826962bfb864b4abc1d68212cf7cca6f 100644 (file)
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -31,6 +31,7 @@ struct worker {
         struct work_struct      *current_work;  /* K: work being processed and its */
         work_func_t             current_func;   /* K: function */
         struct pool_workqueue   *current_pwq;   /* K: pwq */
+       u64                     current_at;     /* K: runtime at start or last wakeup */
         unsigned int            current_color;  /* K: color */
  
         int                     sleeping;       /* S: is worker sleeping? */
@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
   */
  void wq_worker_running(struct task_struct *task);
  void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_tick(struct task_struct *task);
  work_func_t wq_worker_last_func(struct task_struct *task);
  
  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py

index fc1643ba06b37a3b72ec9adb7a7074fff574dd1b..7c6f523b9164fa4f9cc43dbd405b5446fe55da55 100644 (file)
--- a/tools/workqueue/wq_monitor.py
+++ b/tools/workqueue/wq_monitor.py
@@ -11,6 +11,11 @@ https://github.com/osandov/drgn.
  
    infl     The number of currently in-flight work items.
  
+  CPUitsv  The number of times a concurrency-managed work item hogged CPU
+           longer than the threshold (workqueue.cpu_intensive_thresh_us)
+           and got excluded from concurrency management to avoid stalling
+           other work items.
+
    CMwake   The number of concurrency-management wake-ups while executing a
             work item of the workqueue.
  
@@ -53,6 +58,7 @@ WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']
  
  PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']      # work items started execution
  PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']   # work items completed execution
+PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
  PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']    # concurrency-management worker wakeups
  PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']      # maydays to rescuer
  PWQ_STAT_RESCUED        = prog['PWQ_STAT_RESCUED']     # linked work items executed by rescuer
@@ -75,19 +81,23 @@ class WqStats:
                   'mem_reclaim'          : self.mem_reclaim,
                   'started'              : self.stats[PWQ_STAT_STARTED],
                   'completed'            : self.stats[PWQ_STAT_COMPLETED],
+                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                   'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                   'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                   'rescued'              : self.stats[PWQ_STAT_RESCUED], }
  
      def table_header_str():
-        return f'{"":>24} {"total":>8} {"infl":>5} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
+        return f'{"":>24} {"total":>8} {"infl":>5} '\
+            f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
  
      def table_row_str(self):
+        cpu_intensive = '-'
          cm_wakeup = '-'
          mayday = '-'
          rescued = '-'
  
          if not self.unbound:
+            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
              cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
  
          if self.mem_reclaim:
@@ -97,6 +107,7 @@ class WqStats:
          out = f'{self.name[-24:]:24} ' \
                f'{self.stats[PWQ_STAT_STARTED]:8} ' \
                f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
+              f'{cpu_intensive:>7} ' \
                f'{cm_wakeup:>7} ' \
                f'{mayday:>7} ' \
                f'{rescued:>7} '
author	Tejun Heo <tj@kernel.org>
	Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
committer	Tejun Heo <tj@kernel.org>
	Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
Documentation/core-api/workqueue.rst		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/workqueue.c		patch \| blob \| history
kernel/workqueue_internal.h		patch \| blob \| history
tools/workqueue/wq_monitor.py		patch \| blob \| history