btrfs: use alloc_ordered_workqueue() to create ordered workqueues
authorTejun Heo <tj@kernel.org>
Thu, 25 May 2023 23:33:08 +0000 (13:33 -1000)
committerDavid Sterba <dsterba@suse.com>
Mon, 19 Jun 2023 11:59:30 +0000 (13:59 +0200)
BACKGROUND
==========

When multiple work items are queued to a workqueue, their execution order
doesn't match the queueing order. They may get executed in any order and
simultaneously. When fully serialized execution - one by one in the queueing
order - is needed, an ordered workqueue should be used which can be created
with alloc_ordered_workqueue().

However, alloc_ordered_workqueue() was a later addition. Before it, an
ordered workqueue could be obtained by creating an UNBOUND workqueue with
@max_active==1. This originally was an implementation side-effect which was
broken by 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound
workqueues"). Because there were users that depended on the ordered execution,
5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered")
made the workqueue allocation path implicitly promote UNBOUND workqueues w/
@max_active==1 to ordered workqueues.

While this has worked okay, overloading the UNBOUND allocation interface
this way creates other issues. It's difficult to tell whether a given
workqueue actually needs to be ordered and users that legitimately want a
min concurrency level wq unexpectedly get an ordered one instead. With
planned UNBOUND workqueue updates to improve execution locality and more
prevalence of chiplet designs which can benefit from such improvements, this
isn't a state we wanna be in forever.

This patch series audits all call sites that create an UNBOUND workqueue w/
@max_active==1 and converts them to alloc_ordered_workqueue() as necessary.

BTRFS
=====

* fs_info->scrub_workers initialized in scrub_workers_get() was setting
  @max_active to 1 when @is_dev_replace is set and it seems that the
  workqueue actually needs to be ordered if @is_dev_replace. Update the code
  so that alloc_ordered_workqueue() is used if @is_dev_replace.

* fs_info->discard_ctl.discard_workers initialized in
  btrfs_init_workqueues() was directly using alloc_workqueue() w/
  @max_active==1. Converted to alloc_ordered_workqueue().

* fs_info->fixup_workers and fs_info->qgroup_rescan_workers initialized in
  btrfs_init_workqueues() use btrfs's workqueue wrapper, btrfs_workqueue,
  which are allocated with btrfs_alloc_workqueue().

  btrfs_workqueue implements automatic @max_active adjustment which is
  disabled when the specified max limit is below a certain threshold, so
  calling btrfs_alloc_workqueue() with @limit_active==1 yields an ordered
  workqueue whose @max_active won't be changed as the auto-tuning is
  disabled.

  This is rather brittle in that nothing clearly indicates that the two
  workqueues should be ordered or btrfs_alloc_workqueue() must disable
  auto-tuning when @limit_active==1.

  This patch factors out the common btrfs_workqueue init code into
  btrfs_init_workqueue() and adds an explicit btrfs_alloc_ordered_workqueue().
  The two workqueues are converted to use the new ordered allocation
  interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/disk-io.c
fs/btrfs/scrub.c

index aac2404..ce083e9 100644 (file)
@@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
        return atomic_read(&wq->pending) > wq->thresh * 2;
 }
 
+static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
+                                struct btrfs_fs_info *fs_info)
+{
+       wq->fs_info = fs_info;
+       atomic_set(&wq->pending, 0);
+       INIT_LIST_HEAD(&wq->ordered_list);
+       spin_lock_init(&wq->list_lock);
+       spin_lock_init(&wq->thres_lock);
+}
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
                                              const char *name, unsigned int flags,
                                              int limit_active, int thresh)
@@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
        if (!ret)
                return NULL;
 
-       ret->fs_info = fs_info;
+       btrfs_init_workqueue(ret, fs_info);
+
        ret->limit_active = limit_active;
-       atomic_set(&ret->pending, 0);
        if (thresh == 0)
                thresh = DFT_THRESHOLD;
        /* For low threshold, disabling threshold is a better choice */
@@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
                return NULL;
        }
 
-       INIT_LIST_HEAD(&ret->ordered_list);
-       spin_lock_init(&ret->list_lock);
-       spin_lock_init(&ret->thres_lock);
+       trace_btrfs_workqueue_alloc(ret, name);
+       return ret;
+}
+
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+                               struct btrfs_fs_info *fs_info, const char *name,
+                               unsigned int flags)
+{
+       struct btrfs_workqueue *ret;
+
+       ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+       if (!ret)
+               return NULL;
+
+       btrfs_init_workqueue(ret, fs_info);
+
+       /* Ordered workqueues don't allow @max_active adjustments. */
+       ret->limit_active = 1;
+       ret->current_active = 1;
+       ret->thresh = NO_THRESHOLD;
+
+       ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
+       if (!ret->normal_wq) {
+               kfree(ret);
+               return NULL;
+       }
+
        trace_btrfs_workqueue_alloc(ret, name);
        return ret;
 }
index 6e2596d..30f66c5 100644 (file)
@@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
                                              unsigned int flags,
                                              int limit_active,
                                              int thresh);
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+                               struct btrfs_fs_info *fs_info, const char *name,
+                               unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
                     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
index 06dab9d..1ad7037 100644 (file)
@@ -1939,6 +1939,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 {
        u32 max_active = fs_info->thread_pool_size;
        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+       unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
 
        fs_info->workers =
                btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
@@ -1955,7 +1956,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
                btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
 
        fs_info->fixup_workers =
-               btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
+               btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
 
        fs_info->endio_workers =
                alloc_workqueue("btrfs-endio", flags, max_active);
@@ -1974,9 +1975,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
                btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
                                      max_active, 0);
        fs_info->qgroup_rescan_workers =
-               btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
+               btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
+                                             ordered_flags);
        fs_info->discard_ctl.discard_workers =
-               alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
+               alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
 
        if (!(fs_info->workers &&
              fs_info->delalloc_workers && fs_info->flush_workers &&
index d7a1445..316c5a0 100644 (file)
@@ -2740,8 +2740,10 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
        if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
                return 0;
 
-       scrub_workers = alloc_workqueue("btrfs-scrub", flags,
-                                       is_dev_replace ? 1 : max_active);
+       if (is_dev_replace)
+               scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
+       else
+               scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
        if (!scrub_workers)
                goto fail_scrub_workers;