workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism
author    Tejun Heo <tj@kernel.org>
          Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
committer Tejun Heo <tj@kernel.org>
          Thu, 18 May 2023 03:02:08 +0000 (17:02 -1000)
Workqueue now automatically marks per-cpu work items that hog CPU for too
long as CPU_INTENSIVE, which excludes them from concurrency management and
prevents them from stalling other concurrency-managed work items. If a work
function keeps running over the threshold, it likely needs to be switched to
use an unbound workqueue, as illustrated by the sketch below.
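
For illustration (not part of this patch), a hypothetical work function like
the sketch below would blow past the threshold on every invocation; queueing
it on a WQ_UNBOUND workqueue is the suggested fix:

  #include <linux/init.h>
  #include <linux/module.h>
  #include <linux/workqueue.h>

  /* Hypothetical work function which burns CPU for well over the default
   * 10ms wq_cpu_intensive_thresh_us on every invocation. */
  static void crunch_fn(struct work_struct *work)
  {
          volatile u64 sum = 0;
          u64 i;

          for (i = 0; i < (1ULL << 28); i++)
                  sum += i;
  }
  static DECLARE_WORK(crunch_work, crunch_fn);

  static struct workqueue_struct *crunch_wq;

  static int __init crunch_init(void)
  {
          /* WQ_UNBOUND excludes the work item from per-cpu concurrency
           * management, so it can't stall other per-cpu work items. */
          crunch_wq = alloc_workqueue("crunch", WQ_UNBOUND, 0);
          if (!crunch_wq)
                  return -ENOMEM;
          queue_work(crunch_wq, &crunch_work);
          return 0;
  }

  static void __exit crunch_exit(void)
  {
          destroy_workqueue(crunch_wq);   /* drains pending work first */
  }

  module_init(crunch_init);
  module_exit(crunch_exit);
  MODULE_LICENSE("GPL");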

This patch adds a debug mechanism which tracks the work functions that
trigger the automatic CPU_INTENSIVE mechanism and reports them using
printk_deferred() with exponential backoff.
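
With the default 10ms threshold, a repeat offender is reported on its 4th,
8th, 16th, ... violation, producing a console line like the following
(function name hypothetical, message format as added by this patch):

  workqueue: crunch_fn hogged CPU for >10000us 16 times, consider switching to WQ_UNBOUND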

v3: Documentation update.

v2: Drop bouncing to kthread_worker for printing messages. It was meant to
    avoid introducing a circular locking dependency through printk but wasn't
    effective as it still had the pool lock -> wci_lock -> printk -> pool lock
    loop. Let's just print directly using printk_deferred().

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Documentation/admin-guide/kernel-parameters.txt
kernel/workqueue.c
lib/Kconfig.debug

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1f2185c..3ed7dda 100644
                        them from noticeably delaying other per-cpu work
                        items. Default is 10000 (10ms).
 
+                       If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
+                       will report the work functions which violate this
+                       threshold repeatedly. They are likely good
+                       candidates for using WQ_UNBOUND workqueues instead.
+
        workqueue.disable_numa
                        By default, all work items queued to unbound
                        workqueues are affine to the NUMA nodes they're
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3dc83d5..4ca6638 100644
@@ -948,6 +948,98 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
                        pool->nr_running++;
 }
 
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+
+/*
+ * Concurrency-managed per-cpu work items that hog CPU for longer than
+ * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
+ * which prevents them from stalling other concurrency-managed work items. If a
+ * work function keeps triggering this mechanism, it's likely that the work item
+ * should be using an unbound workqueue instead.
+ *
+ * wq_cpu_intensive_report() tracks work functions which trigger such conditions
+ * and reports them so that they can be examined and converted to use unbound
+ * workqueues as appropriate. To avoid flooding the console, each violating work
+ * function is tracked and reported with exponential backoff.
+ */
+#define WCI_MAX_ENTS 128
+
+struct wci_ent {
+       work_func_t             func;
+       atomic64_t              cnt;
+       struct hlist_node       hash_node;
+};
+
+static struct wci_ent wci_ents[WCI_MAX_ENTS];
+static int wci_nr_ents;
+static DEFINE_RAW_SPINLOCK(wci_lock);
+static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
+
+static struct wci_ent *wci_find_ent(work_func_t func)
+{
+       struct wci_ent *ent;
+
+       hash_for_each_possible_rcu(wci_hash, ent, hash_node,
+                                  (unsigned long)func) {
+               if (ent->func == func)
+                       return ent;
+       }
+       return NULL;
+}
+
+static void wq_cpu_intensive_report(work_func_t func)
+{
+       struct wci_ent *ent;
+
+restart:
+       ent = wci_find_ent(func);
+       if (ent) {
+               u64 cnt;
+
+               /*
+                * Start reporting from the fourth time and back off
+                * exponentially.
+                */
+               cnt = atomic64_inc_return_relaxed(&ent->cnt);
+               if (cnt >= 4 && is_power_of_2(cnt))
+                       printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
+                                       ent->func, wq_cpu_intensive_thresh_us,
+                                       atomic64_read(&ent->cnt));
+               return;
+       }
+
+       /*
+        * @func is a new violation. Allocate a new entry for it. If wci_ents[]
+        * is exhausted, something went really wrong and we probably made enough
+        * noise already.
+        */
+       if (wci_nr_ents >= WCI_MAX_ENTS)
+               return;
+
+       raw_spin_lock(&wci_lock);
+
+       if (wci_nr_ents >= WCI_MAX_ENTS) {
+               raw_spin_unlock(&wci_lock);
+               return;
+       }
+
+       if (wci_find_ent(func)) {
+               raw_spin_unlock(&wci_lock);
+               goto restart;
+       }
+
+       ent = &wci_ents[wci_nr_ents++];
+       ent->func = func;
+       atomic64_set(&ent->cnt, 1);
+       hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
+
+       raw_spin_unlock(&wci_lock);
+}
+
+#else  /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+static void wq_cpu_intensive_report(work_func_t func) {}
+#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+
 /**
  * wq_worker_running - a worker is running again
  * @task: task waking up
@@ -1057,6 +1149,7 @@ void wq_worker_tick(struct task_struct *task)
        raw_spin_lock(&pool->lock);
 
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+       wq_cpu_intensive_report(worker->current_func);
        pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
 
        if (need_more_worker(pool)) {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ce51d4d..97e880a 100644
@@ -1134,6 +1134,19 @@ config WQ_WATCHDOG
          state.  This can be configured through kernel parameter
          "workqueue.watchdog_thresh" and its sysfs counterpart.
 
+config WQ_CPU_INTENSIVE_REPORT
+       bool "Report per-cpu work items which hog CPU for too long"
+       depends on DEBUG_KERNEL
+       help
+         Say Y here to enable reporting of concurrency-managed per-cpu work
+         items that hog CPUs for longer than
+         workqueue.cpu_intensive_thresh_us. Workqueue automatically
+         detects and excludes them from concurrency management to prevent
+         them from stalling other per-cpu work items. Occasional
+         triggering may not necessarily indicate a problem. Repeated
+         triggering likely indicates that the work item should be switched
+         to use an unbound workqueue.
+
 config TEST_LOCKUP
        tristate "Test module to generate lockups"
        depends on m