diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c913e33..02a8f40 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,6 +126,12 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
+ * K: Only modified by worker while holding pool->lock. Can be safely read by
+ *    self, while holding pool->lock or from IRQ context if %current is the
+ *    kworker.
+ *
+ * S: Only modified by worker self.
+ *
  * A: wq_pool_attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
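
The new 'K' rule is what the tick-time code added later in this diff relies on: a kworker's current_* fields are written by the worker itself under pool->lock, and scheduler_tick() may read them from hard IRQ context without the lock because the interrupted task is that kworker. A minimal sketch of the pattern, using hypothetical helpers that are not part of the patch and assume they sit inside kernel/workqueue.c where struct worker is visible:

/*
 * Writer side: the worker updates a K-protected field under pool->lock,
 * e.g. when it starts executing a new work item.
 */
static void k_rule_write(struct worker *worker, struct pool_workqueue *pwq)
{
	lockdep_assert_held(&worker->pool->lock);
	worker->current_pwq = pwq;
}

/*
 * Reader side: hard IRQ context may look at the same field without
 * pool->lock, but only when %current is that kworker.
 */
static struct pool_workqueue *k_rule_read_from_irq(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	WARN_ON_ONCE(task != current);
	return worker->current_pwq;
}
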
@@ -200,6 +206,22 @@ struct worker_pool {
 };
 
 /*
+ * Per-pool_workqueue statistics. These can be monitored using
+ * tools/workqueue/wq_monitor.py.
+ */
+enum pool_workqueue_stats {
+       PWQ_STAT_STARTED,       /* work items started execution */
+       PWQ_STAT_COMPLETED,     /* work items completed execution */
+       PWQ_STAT_CPU_TIME,      /* total CPU time consumed */
+       PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
+       PWQ_STAT_CM_WAKEUP,     /* concurrency-management worker wakeups */
+       PWQ_STAT_MAYDAY,        /* maydays to rescuer */
+       PWQ_STAT_RESCUED,       /* linked work items executed by rescuer */
+
+       PWQ_NR_STATS,
+};
+
+/*
  * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
  * of work_struct->data are used for flags and the remaining high bits
  * point to the pwq; thus, pwqs need to be aligned at two's power of the
@@ -236,6 +258,8 @@ struct pool_workqueue {
        struct list_head        pwqs_node;      /* WR: node on wq->pwqs */
        struct list_head        mayday_node;    /* MD: node on wq->maydays */
 
+       u64                     stats[PWQ_NR_STATS];
+
        /*
         * Release of unbound pwq is punted to system_wq.  See put_pwq()
         * and pwq_unbound_release_workfn() for details.  pool_workqueue
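
Each pool_workqueue now carries a small u64 counter array indexed by the enum above; tools/workqueue/wq_monitor.py reads the counters and sums them per workqueue. As a rough in-kernel sketch of that aggregation (wq_sum_stat() is hypothetical, not part of the patch, and assumes the existing for_each_pwq() iterator in kernel/workqueue.c):

/* Hypothetical helper: total one counter across all pwqs of a workqueue. */
static u64 wq_sum_stat(struct workqueue_struct *wq, enum pool_workqueue_stats stat)
{
	struct pool_workqueue *pwq;
	u64 sum = 0;

	rcu_read_lock();		/* for_each_pwq() wants RCU or wq->mutex */
	for_each_pwq(pwq, wq)
		sum += pwq->stats[stat];
	rcu_read_unlock();

	/* counters are bumped locklessly; a racy snapshot is fine for monitoring */
	return sum;
}
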
@@ -310,6 +334,14 @@ static struct kmem_cache *pwq_cache;
 static cpumask_var_t *wq_numa_possible_cpumask;
                                        /* possible CPUs of each node */
 
+/*
+ * Per-cpu work items which run for longer than the following threshold are
+ * automatically considered CPU intensive and excluded from concurrency
+ * management to prevent them from noticeably delaying other per-cpu work items.
+ */
+static unsigned long wq_cpu_intensive_thresh_us = 10000;
+module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
+
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
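
Because the parameter is registered with mode 0644, the threshold should also be adjustable at runtime (on a built-in kernel presumably via /sys/module/workqueue/parameters/cpu_intensive_thresh_us, or workqueue.cpu_intensive_thresh_us=<usecs> on the command line; the path is inferred from module_param_named() above, not spelled out in the patch). Setting it to 0 disables the detection entirely, as wq_worker_tick() below bails out early in that case. The comparison the detector performs, written out as a standalone predicate for illustration only:

/*
 * Illustration: a work item is flagged once it has accumulated at least
 * wq_cpu_intensive_thresh_us of CPU time without sleeping.  Runtimes come
 * from se.sum_exec_runtime and are in nanoseconds; the knob is in
 * microseconds.
 */
static bool hogged_cpu(u64 runtime_now_ns, u64 runtime_at_start_ns)
{
	return runtime_now_ns - runtime_at_start_ns >=
		wq_cpu_intensive_thresh_us * NSEC_PER_USEC;
}
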
@@ -867,6 +899,152 @@ static void wake_up_worker(struct worker_pool *pool)
 }
 
 /**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to set
+ *
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock)
+ */
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
+{
+       struct worker_pool *pool = worker->pool;
+
+       WARN_ON_ONCE(worker->task != current);
+
+       /* If transitioning into NOT_RUNNING, adjust nr_running. */
+       if ((flags & WORKER_NOT_RUNNING) &&
+           !(worker->flags & WORKER_NOT_RUNNING)) {
+               pool->nr_running--;
+       }
+
+       worker->flags |= flags;
+}
+
+/**
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to clear
+ *
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock)
+ */
+static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
+{
+       struct worker_pool *pool = worker->pool;
+       unsigned int oflags = worker->flags;
+
+       WARN_ON_ONCE(worker->task != current);
+
+       worker->flags &= ~flags;
+
+       /*
+        * If transitioning out of NOT_RUNNING, increment nr_running.  Note
+        * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
+        * of multiple flags, not a single flag.
+        */
+       if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+               if (!(worker->flags & WORKER_NOT_RUNNING))
+                       pool->nr_running++;
+}
+
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+
+/*
+ * Concurrency-managed per-cpu work items that hog CPU for longer than
+ * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
+ * which prevents them from stalling other concurrency-managed work items. If a
+ * work function keeps triggering this mechanism, it's likely that the work item
+ * should be using an unbound workqueue instead.
+ *
+ * wq_cpu_intensive_report() tracks work functions which trigger such conditions
+ * and reports them so that they can be examined and converted to use unbound
+ * workqueues as appropriate. To avoid flooding the console, each violating work
+ * function is tracked and reported with exponential backoff.
+ */
+#define WCI_MAX_ENTS 128
+
+struct wci_ent {
+       work_func_t             func;
+       atomic64_t              cnt;
+       struct hlist_node       hash_node;
+};
+
+static struct wci_ent wci_ents[WCI_MAX_ENTS];
+static int wci_nr_ents;
+static DEFINE_RAW_SPINLOCK(wci_lock);
+static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
+
+static struct wci_ent *wci_find_ent(work_func_t func)
+{
+       struct wci_ent *ent;
+
+       hash_for_each_possible_rcu(wci_hash, ent, hash_node,
+                                  (unsigned long)func) {
+               if (ent->func == func)
+                       return ent;
+       }
+       return NULL;
+}
+
+static void wq_cpu_intensive_report(work_func_t func)
+{
+       struct wci_ent *ent;
+
+restart:
+       ent = wci_find_ent(func);
+       if (ent) {
+               u64 cnt;
+
+               /*
+                * Start reporting from the fourth time and back off
+                * exponentially.
+                */
+               cnt = atomic64_inc_return_relaxed(&ent->cnt);
+               if (cnt >= 4 && is_power_of_2(cnt))
+                       printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
+                                       ent->func, wq_cpu_intensive_thresh_us,
+                                       atomic64_read(&ent->cnt));
+               return;
+       }
+
+       /*
+        * @func is a new violation. Allocate a new entry for it. If wci_ents[]
+        * is exhausted, something went really wrong and we probably made enough
+        * noise already.
+        */
+       if (wci_nr_ents >= WCI_MAX_ENTS)
+               return;
+
+       raw_spin_lock(&wci_lock);
+
+       if (wci_nr_ents >= WCI_MAX_ENTS) {
+               raw_spin_unlock(&wci_lock);
+               return;
+       }
+
+       if (wci_find_ent(func)) {
+               raw_spin_unlock(&wci_lock);
+               goto restart;
+       }
+
+       ent = &wci_ents[wci_nr_ents++];
+       ent->func = func;
+       atomic64_set(&ent->cnt, 1);
+       hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
+
+       raw_spin_unlock(&wci_lock);
+}
+
+#else  /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+static void wq_cpu_intensive_report(work_func_t func) {}
+#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+
+/**
  * wq_worker_running - a worker is running again
  * @task: task waking up
  *
@@ -876,7 +1054,7 @@ void wq_worker_running(struct task_struct *task)
 {
        struct worker *worker = kthread_data(task);
 
-       if (!worker->sleeping)
+       if (!READ_ONCE(worker->sleeping))
                return;
 
        /*
@@ -889,7 +1067,14 @@ void wq_worker_running(struct task_struct *task)
        if (!(worker->flags & WORKER_NOT_RUNNING))
                worker->pool->nr_running++;
        preempt_enable();
-       worker->sleeping = 0;
+
+       /*
+        * CPU intensive auto-detection cares about how long a work item hogged
+        * CPU without sleeping. Reset the starting timestamp on wakeup.
+        */
+       worker->current_at = worker->task->se.sum_exec_runtime;
+
+       WRITE_ONCE(worker->sleeping, 0);
 }
 
 /**
@@ -915,10 +1100,10 @@ void wq_worker_sleeping(struct task_struct *task)
        pool = worker->pool;
 
        /* Return if preempted before wq_worker_running() was reached */
-       if (worker->sleeping)
+       if (READ_ONCE(worker->sleeping))
                return;
 
-       worker->sleeping = 1;
+       WRITE_ONCE(worker->sleeping, 1);
        raw_spin_lock_irq(&pool->lock);
 
        /*
@@ -932,12 +1117,66 @@ void wq_worker_sleeping(struct task_struct *task)
        }
 
        pool->nr_running--;
-       if (need_more_worker(pool))
+       if (need_more_worker(pool)) {
+               worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
                wake_up_worker(pool);
+       }
        raw_spin_unlock_irq(&pool->lock);
 }
 
 /**
+ * wq_worker_tick - a scheduler tick occurred while a kworker is running
+ * @task: task currently running
+ *
+ * Called from scheduler_tick(). We're in the IRQ context and the current
+ * worker's fields which follow the 'K' locking rule can be accessed safely.
+ */
+void wq_worker_tick(struct task_struct *task)
+{
+       struct worker *worker = kthread_data(task);
+       struct pool_workqueue *pwq = worker->current_pwq;
+       struct worker_pool *pool = worker->pool;
+
+       if (!pwq)
+               return;
+
+       pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
+
+       if (!wq_cpu_intensive_thresh_us)
+               return;
+
+       /*
+        * If the current worker is concurrency managed and hogged the CPU for
+        * longer than wq_cpu_intensive_thresh_us, it's automatically marked
+        * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
+        *
+        * If @worker->sleeping is set, @worker is in the process of
+        * switching out voluntarily and won't be contributing to
+        * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
+        * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
+        * double decrements. The task is releasing the CPU anyway. Let's skip.
+        * We probably want to make this prettier in the future.
+        */
+       if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
+           worker->task->se.sum_exec_runtime - worker->current_at <
+           wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
+               return;
+
+       raw_spin_lock(&pool->lock);
+
+       worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+       wq_cpu_intensive_report(worker->current_func);
+       pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
+
+       if (need_more_worker(pool)) {
+               pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+               wake_up_worker(pool);
+       }
+
+       raw_spin_unlock(&pool->lock);
+}
+
+/**
  * wq_worker_last_func - retrieve worker's last work function
  * @task: Task to retrieve last work function of.
  *
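
wq_worker_tick() is the heart of the auto-detection: on every scheduler tick it charges TICK_USEC to PWQ_STAT_CPU_TIME (so CPU time is accounted at tick granularity) and compares how much runtime the current work item has accumulated since worker->current_at against the threshold. The READ_ONCE()/WRITE_ONCE() annotations on worker->sleeping in the hunks above are presumably there because this tick path now inspects the field from IRQ context. How quickly a hog gets flagged therefore depends on HZ; a back-of-the-envelope userspace sketch, ignoring where within a tick the work item started:

#include <stdio.h>

/*
 * Roughly how many ticks does a continuously running work item survive
 * before wq_worker_tick() marks it CPU_INTENSIVE, for a given HZ and
 * threshold (default 10000us)?
 */
static unsigned int ticks_until_flagged(unsigned int hz, unsigned long thresh_us)
{
	unsigned long tick_usec = 1000000UL / hz;

	/* flagged on the first tick where accumulated runtime >= threshold */
	return (thresh_us + tick_usec - 1) / tick_usec;
}

int main(void)
{
	printf("HZ=1000: ~%u ticks\n", ticks_until_flagged(1000, 10000)); /* 10 */
	printf("HZ=250:  ~%u ticks\n", ticks_until_flagged(250, 10000));  /*  3 */
	printf("HZ=100:  ~%u ticks\n", ticks_until_flagged(100, 10000));  /*  1 */
	return 0;
}
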
@@ -969,60 +1208,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
 }
 
 /**
- * worker_set_flags - set worker flags and adjust nr_running accordingly
- * @worker: self
- * @flags: flags to set
- *
- * Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
- */
-static inline void worker_set_flags(struct worker *worker, unsigned int flags)
-{
-       struct worker_pool *pool = worker->pool;
-
-       WARN_ON_ONCE(worker->task != current);
-
-       /* If transitioning into NOT_RUNNING, adjust nr_running. */
-       if ((flags & WORKER_NOT_RUNNING) &&
-           !(worker->flags & WORKER_NOT_RUNNING)) {
-               pool->nr_running--;
-       }
-
-       worker->flags |= flags;
-}
-
-/**
- * worker_clr_flags - clear worker flags and adjust nr_running accordingly
- * @worker: self
- * @flags: flags to clear
- *
- * Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
- */
-static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
-{
-       struct worker_pool *pool = worker->pool;
-       unsigned int oflags = worker->flags;
-
-       WARN_ON_ONCE(worker->task != current);
-
-       worker->flags &= ~flags;
-
-       /*
-        * If transitioning out of NOT_RUNNING, increment nr_running.  Note
-        * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
-        * of multiple flags, not a single flag.
-        */
-       if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
-               if (!(worker->flags & WORKER_NOT_RUNNING))
-                       pool->nr_running++;
-}
-
-/**
  * find_worker_executing_work - find worker which is executing a work
  * @pool: pool of interest
  * @work: work to find worker for
@@ -1542,6 +1727,8 @@ out:
  * We queue the work to a specific CPU, the caller must ensure it
  * can't go away.  Callers that fail to ensure that the specified
  * CPU cannot go away will execute on a randomly chosen CPU.
+ * But note well that callers specifying a CPU that never has been
+ * online will get a splat.
  *
  * Return: %false if @work was already on a queue, %true otherwise.
  */
@@ -2166,6 +2353,7 @@ static void send_mayday(struct work_struct *work)
                get_pwq(pwq);
                list_add_tail(&pwq->mayday_node, &wq->maydays);
                wake_up_process(wq->rescuer->task);
+               pwq->stats[PWQ_STAT_MAYDAY]++;
        }
 }
 
@@ -2303,7 +2491,6 @@ __acquires(&pool->lock)
 {
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
-       bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
        unsigned long work_data;
        struct worker *collision;
 #ifdef CONFIG_LOCKDEP
@@ -2340,6 +2527,7 @@ __acquires(&pool->lock)
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
+       worker->current_at = worker->task->se.sum_exec_runtime;
        work_data = *work_data_bits(work);
        worker->current_color = get_work_color(work_data);
 
@@ -2357,7 +2545,7 @@ __acquires(&pool->lock)
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
-       if (unlikely(cpu_intensive))
+       if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
        /*
@@ -2404,6 +2592,7 @@ __acquires(&pool->lock)
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
+       pwq->stats[PWQ_STAT_STARTED]++;
        trace_workqueue_execute_start(work);
        worker->current_func(work);
        /*
@@ -2411,6 +2600,7 @@ __acquires(&pool->lock)
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
+       pwq->stats[PWQ_STAT_COMPLETED]++;
        lock_map_release(&lockdep_map);
        lock_map_release(&pwq->wq->lockdep_map);
 
@@ -2435,9 +2625,12 @@ __acquires(&pool->lock)
 
        raw_spin_lock_irq(&pool->lock);
 
-       /* clear cpu intensive status */
-       if (unlikely(cpu_intensive))
-               worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+       /*
+        * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
+        * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
+        * wq_cpu_intensive_thresh_us. Clear it.
+        */
+       worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;
@@ -2654,6 +2847,7 @@ repeat:
                                if (first)
                                        pool->watchdog_ts = jiffies;
                                move_linked_works(work, scheduled, &n);
+                               pwq->stats[PWQ_STAT_RESCUED]++;
                        }
                        first = false;
                }