From 9546b29e4a6ad6ed7924dd7980975c8e675740a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 7 Aug 2023 15:57:25 -1000
Subject: [PATCH] workqueue: Add workqueue_attrs->__pod_cpumask

workqueue_attrs has two uses:

* to specify the required unbound workqueue properties by users

* to match worker_pool's properties to workqueues by core code

For example, if the user wants to restrict a workqueue to run only on CPUs 0
and 2, and the two CPUs are on different affinity scopes, the workqueue's
attrs->cpumask would contain CPUs 0 and 2, and the workqueue would be
associated with two worker_pools, one with attrs->cpumask containing just
CPU 0 and the other CPU 2.

Workqueue wants to support non-strict affinity scopes where work items are
started in their matching affinity scopes but the scheduler is free to
migrate them outside the starting scopes, which can enable utilizing the
whole machine while maintaining most of the locality benefits from affinity
scopes.

To enable that, a worker_pool needs to distinguish the strict affinity that it
has to follow (because that's the restriction coming from the user) and the
soft affinity that it wants to apply when dispatching work items. Note that
two worker_pools with different soft dispatching requirements have to be
separate; otherwise, for example, we'd be ping-ponging worker threads across
NUMA boundaries constantly.

This patch adds workqueue_attrs->__pod_cpumask. The new field is double
underscored as it's only used internally to distinguish worker_pools. A
worker_pool's ->cpumask is now always the same as the online subset of
allowed CPUs of the associated workqueues, and ->__pod_cpumask is the pod's
subset of that ->cpumask. Going back to the example above, both worker_pools
would have ->cpumask containing both CPUs 0 and 2 but one's ->__pod_cpumask
would contain 0 while the other's 2.

* pool_allowed_cpus() is added. It returns the worker_pool's strict cpumask
  that the pool's workers must stay within. This is currently always
  ->__pod_cpumask as all boundaries are still strict.

* As a workqueue_attrs can now track both the associated workqueues' cpumask
  and its per-pod subset, wq_calc_pod_cpumask() no longer needs an external
  out-argument. Drop @cpumask and instead store the result in
  ->__pod_cpumask.

* The above also simplifies apply_wqattrs_prepare() as the same
  workqueue_attrs can be used to create all pods associated with a
  workqueue. tmp_attrs is dropped.

* wq_update_pod() is updated to use wqattrs_equal() to test whether a pwq
  update is needed instead of only comparing ->cpumask so that
  ->__pod_cpumask is compared too. It can directly compare ->__pod_cpumask
  but the code is easier to understand and more robust this way.

The only user-visible behavior change is that two workqueues with different
cpumasks no longer can share worker_pools even when their pod subsets
coincide. Going back to the example, let's say there's another workqueue
with cpumask 0, 2, 3, where 2 and 3 are in the same pod. It would be mapped
to two worker_pools - one with CPU 0, the other with 2 and 3. The former has
the same cpumask as the first pod of the earlier example and would have
shared the same worker_pool but that's no longer the case after this patch.
The worker_pools would have the same ->__pod_cpumask but their ->cpumask's
wouldn't match.

While this is necessary to support non-strict affinity scopes, there can be
further optimizations to maintain sharing among strict affinity scopes.
However, non-strict affinity scopes are going to be preferable for most use
cases and we don't see a very diverse mixture of unbound workqueue cpumasks
anyway, so the additional overhead doesn't seem to justify the extra
complexity.

v2: - wq_update_pod() was incorrectly comparing target_attrs->__pod_cpumask
      to pool->attrs->cpumask instead of its ->__pod_cpumask. Fix it by
      using wqattrs_equal() for comparison instead.

    - Per-cpu worker pools weren't initializing ->__pod_cpumask which caused
      a subtle problem later on. Set it to cpumask_of(cpu) like ->cpumask.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 16 ++++++++++
 kernel/workqueue.c        | 74 +++++++++++++++++++++++------------------------
 2 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 568cfbc..fe53976 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -150,9 +150,25 @@ struct workqueue_attrs {
 
 	/**
 	 * @cpumask: allowed CPUs
+	 *
+	 * Work items in this workqueue are affine to these CPUs and not allowed
+	 * to execute on other CPUs. A pool serving a workqueue must have the
+	 * same @cpumask.
 	 */
 	cpumask_var_t cpumask;
 
+	/**
+	 * @__pod_cpumask: internal attribute used to create per-pod pools
+	 *
+	 * Internal use only.
+	 *
+	 * Per-pod unbound worker pools are used to improve locality. Always a
+	 * subset of ->cpumask. A workqueue can be associated with multiple
+	 * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
+	 * of a pool's @__pod_cpumask is strict depends on @affn_strict.
+	 */
+	cpumask_var_t __pod_cpumask;
+
 	/*
 	 * Below fields aren't properties of a worker_pool. They only modify how
 	 * :c:func:`apply_workqueue_attrs` select pools and thus don't
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e941fa0..e61b429 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -366,7 +366,6 @@ static bool wq_online;			/* can kworkers be created yet? */
 
 /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
 static struct workqueue_attrs *wq_update_pod_attrs_buf;
-static cpumask_var_t wq_update_pod_cpumask_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
 static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -2050,6 +2049,11 @@ static struct worker *alloc_worker(int node)
 	return worker;
 }
 
+static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
+{
+	return pool->attrs->__pod_cpumask;
+}
+
 /**
  * worker_attach_to_pool() - attach a worker to a pool
  * @worker: worker to be attached
@@ -2075,7 +2079,7 @@ static void worker_attach_to_pool(struct worker *worker,
 		kthread_set_per_cpu(worker->task, pool->cpu);
 
 	if (worker->rescue_wq)
-		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+		set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
 
 	list_add_tail(&worker->node, &pool->workers);
 	worker->pool = pool;
@@ -2167,7 +2171,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 	}
 
 	set_user_nice(worker->task, pool->attrs->nice);
-	kthread_bind_mask(worker->task, pool->attrs->cpumask);
+	kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
 
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
@@ -3672,6 +3676,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
 {
 	if (attrs) {
 		free_cpumask_var(attrs->cpumask);
+		free_cpumask_var(attrs->__pod_cpumask);
 		kfree(attrs);
 	}
 }
@@ -3693,6 +3698,8 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
 		goto fail;
 	if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
 		goto fail;
+	if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
+		goto fail;
 
 	cpumask_copy(attrs->cpumask, cpu_possible_mask);
 	attrs->affn_scope = wq_affn_dfl;
@@ -3707,6 +3714,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 {
 	to->nice = from->nice;
 	cpumask_copy(to->cpumask, from->cpumask);
+	cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
 
 	/*
 	 * Unlike hash and equality test, copying shouldn't ignore wq-only
@@ -3735,6 +3743,8 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
 	hash = jhash_1word(attrs->nice, hash);
 	hash = jhash(cpumask_bits(attrs->cpumask),
 		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+	hash = jhash(cpumask_bits(attrs->__pod_cpumask),
+		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
 	return hash;
 }
 
@@ -3746,6 +3756,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
 		return false;
 	if (!cpumask_equal(a->cpumask, b->cpumask))
 		return false;
+	if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
+		return false;
 	return true;
 }
 
@@ -3998,9 +4010,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 		}
 	}
 
-	/* If cpumask is contained inside a NUMA pod, that's our NUMA node */
+	/* If __pod_cpumask is contained inside a NUMA pod, that's our node */
 	for (pod = 0; pod < pt->nr_pods; pod++) {
-		if (cpumask_subset(attrs->cpumask, pt->pod_cpus[pod])) {
+		if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
 			node = pt->pod_node[pod];
 			break;
 		}
@@ -4190,11 +4202,10 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
  * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @cpu: the target CPU
  * @cpu_going_down: if >= 0, the CPU to consider as offline
- * @cpumask: outarg, the resulting cpumask
  *
  * Calculate the cpumask a workqueue with @attrs should use on @pod. If
  * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
- * The result is stored in @cpumask.
+ * The result is stored in @attrs->__pod_cpumask.
  *
  * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
  * and @pod has online CPUs requested by @attrs, the returned cpumask is the
@@ -4202,27 +4213,27 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
  *
  * The caller is responsible for ensuring that the cpumask of @pod stays stable.
  */
-static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int cpu,
-				int cpu_going_down, cpumask_t *cpumask)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
+				int cpu_going_down)
 {
 	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
 	int pod = pt->cpu_pod[cpu];
 
 	/* does @pod have any online CPUs @attrs wants? */
-	cpumask_and(cpumask, pt->pod_cpus[pod], attrs->cpumask);
-	cpumask_and(cpumask, cpumask, cpu_online_mask);
+	cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
+	cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
 	if (cpu_going_down >= 0)
-		cpumask_clear_cpu(cpu_going_down, cpumask);
+		cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
 
-	if (cpumask_empty(cpumask)) {
-		cpumask_copy(cpumask, attrs->cpumask);
+	if (cpumask_empty(attrs->__pod_cpumask)) {
+		cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
 		return;
 	}
 
 	/* yeap, return possible CPUs in @pod that @attrs wants */
-	cpumask_and(cpumask, attrs->cpumask, pt->pod_cpus[pod]);
+	cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
 
-	if (cpumask_empty(cpumask))
+	if (cpumask_empty(attrs->__pod_cpumask))
 		pr_warn_once("WARNING: workqueue cpumask: online intersect > "
 				"possible intersect\n");
 }
@@ -4276,7 +4287,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		      const cpumask_var_t unbound_cpumask)
 {
 	struct apply_wqattrs_ctx *ctx;
-	struct workqueue_attrs *new_attrs, *tmp_attrs;
+	struct workqueue_attrs *new_attrs;
 	int cpu;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -4288,8 +4299,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs();
-	tmp_attrs = alloc_workqueue_attrs();
-	if (!ctx || !new_attrs || !tmp_attrs)
+	if (!ctx || !new_attrs)
 		goto out_free;
 
 	/*
@@ -4299,23 +4309,18 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	 */
 	copy_workqueue_attrs(new_attrs, attrs);
 	wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
+	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
-	/*
-	 * We may create multiple pwqs with differing cpumasks. Make a copy of
-	 * @new_attrs which will be modified and used to obtain pools.
-	 */
-	copy_workqueue_attrs(tmp_attrs, new_attrs);
-
 	for_each_possible_cpu(cpu) {
 		if (new_attrs->ordered) {
 			ctx->dfl_pwq->refcnt++;
 			ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
 		} else {
-			wq_calc_pod_cpumask(new_attrs, cpu, -1, tmp_attrs->cpumask);
-			ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs);
+			wq_calc_pod_cpumask(new_attrs, cpu, -1);
+			ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
 			if (!ctx->pwq_tbl[cpu])
 				goto out_free;
 		}
@@ -4324,14 +4329,13 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	/* save the user configured attrs and sanitize it. */
 	copy_workqueue_attrs(new_attrs, attrs);
 	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->attrs = new_attrs;
 
 	ctx->wq = wq;
-	free_workqueue_attrs(tmp_attrs);
 	return ctx;
 
 out_free:
-	free_workqueue_attrs(tmp_attrs);
 	free_workqueue_attrs(new_attrs);
 	apply_wqattrs_cleanup(ctx);
 	return ERR_PTR(-ENOMEM);
@@ -4459,7 +4463,6 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 	int off_cpu = online ? -1 : hotplug_cpu;
 	struct pool_workqueue *old_pwq = NULL, *pwq;
 	struct workqueue_attrs *target_attrs;
-	cpumask_t *cpumask;
 
 	lockdep_assert_held(&wq_pool_mutex);
 
@@ -4472,20 +4475,18 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 	 * CPU hotplug exclusion.
 	 */
 	target_attrs = wq_update_pod_attrs_buf;
-	cpumask = wq_update_pod_cpumask_buf;
 
 	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
 	wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
 
 	/* nothing to do if the target cpumask matches the current pwq */
-	wq_calc_pod_cpumask(target_attrs, cpu, off_cpu, cpumask);
+	wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
 	pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
 					lockdep_is_held(&wq_pool_mutex));
-	if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+	if (wqattrs_equal(target_attrs, pwq->pool->attrs))
 		return;
 
 	/* create a new pwq */
-	cpumask_copy(target_attrs->cpumask, cpumask);
 	pwq = alloc_unbound_pwq(wq, target_attrs);
 	if (!pwq) {
 		pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
@@ -5409,7 +5410,7 @@ static void rebind_workers(struct worker_pool *pool)
 	for_each_pool_worker(worker, pool) {
 		kthread_set_per_cpu(worker->task, pool->cpu);
 		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
-						  pool->attrs->cpumask) < 0);
+						  pool_allowed_cpus(pool)) < 0);
 	}
 
 	raw_spin_lock_irq(&pool->lock);
@@ -6424,8 +6425,6 @@ void __init workqueue_init_early(void)
 	wq_update_pod_attrs_buf = alloc_workqueue_attrs();
 	BUG_ON(!wq_update_pod_attrs_buf);
 
-	BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL));
-
 	/* initialize WQ_AFFN_SYSTEM pods */
 	pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
 	pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
@@ -6451,6 +6450,7 @@ void __init workqueue_init_early(void)
 			BUG_ON(init_worker_pool(pool));
 			pool->cpu = cpu;
 			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+			cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
 			pool->attrs->nice = std_nice[i++];
 			pool->node = cpu_to_node(cpu);
 
-- 
2.7.4