iomap: support IOCB_DIO_CALLER_COMP
author Jens Axboe <axboe@kernel.dk>
Sat, 8 Jul 2023 16:01:50 +0000 (10:01 -0600)
committer Jens Axboe <axboe@kernel.dk>
Tue, 1 Aug 2023 23:32:49 +0000 (17:32 -0600)
If IOCB_DIO_CALLER_COMP is set, utilize that to set kiocb->dio_complete
handler and data for that callback. Rather than punt the completion to a
workqueue, we pass back the handler and data to the issuer and will get
a callback from a safe task context.

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=$depth

shows the following results before and after this patch:

        Stock   Patched   Diff
=======================================
QD1     155K    162K      + 4.5%
QD2     290K    313K      + 7.9%
QD4     533K    597K      +12.0%
QD8     604K    827K      +36.9%
QD16    615K    845K      +37.4%

which shows nice wins all around. If we factor in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises,
as the offloaded workqueue completions run out of steam.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/iomap/direct-io.c

index b943bc5..bcd3f8c 100644 (file)
@@ -20,6 +20,7 @@
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_CALLER_COMP  (1U << 26)
 #define IOMAP_DIO_INLINE_COMP  (1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH        (1U << 28)
 #define IOMAP_DIO_NEED_SYNC    (1U << 29)
@@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+       return iomap_dio_complete(data);
+}
+
 static void iomap_dio_complete_work(struct work_struct *work)
 {
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -183,6 +189,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
        }
 
        /*
+        * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
+        * our completion that way to avoid an async punt to a workqueue.
+        */
+       if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+               /* only polled IO cares about ->private being cleared */
+               iocb->private = dio;
+               iocb->dio_complete = iomap_dio_deferred_complete;
+
+               /*
+                * Invoke ->ki_complete() directly. We've assigned our
+                * dio_complete callback handler, and since the issuer set
+                * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
+                * notice ->dio_complete being set and will defer calling that
+                * handler until it can be done from a safe task context.
+                *
+                * Note that the 'res' being passed in here is not important
+                * for this case. The actual completion value of the request
+                * will be gotten from dio_complete when that is run by the
+                * issuer.
+                */
+               iocb->ki_complete(iocb, 0);
+               goto release_bio;
+       }
+
+       /*
         * Async DIO completion that requires filesystem level completion work
         * gets punted to a work queue to complete as the operation may require
         * more IO to be issued to finalise filesystem metadata changes or
@@ -278,12 +309,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
                 * after IO completion such as unwritten extent conversion) and
                 * the underlying device either supports FUA or doesn't have
                 * a volatile write cache. This allows us to avoid cache flushes
-                * on IO completion.
+                * on IO completion. If we can't use writethrough and need to
+                * sync, disable in-task completions as dio completion will
+                * need to call generic_write_sync() which will do a blocking
+                * fsync / cache flush call.
                 */
                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
                    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
                    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
                        use_fua = true;
+               else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+                       dio->flags &= ~IOMAP_DIO_CALLER_COMP;
        }
 
        /*
@@ -298,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
                goto out;
 
        /*
-        * We can only poll for single bio I/Os.
+        * We can only do deferred completion for pure overwrites that
+        * don't require additional IO at completion. This rules out
+        * writes that need zeroing or extent conversion, extend
+        * the file size, or issue journal IO or cache flushes
+        * during completion processing.
         */
        if (need_zeroout ||
+           ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
            ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+               dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+
+       /*
+        * The rules for polled IO completions follow the same guidelines as
+        * the ones we set for inline and deferred completions. If none of
+        * those are available for this IO, clear the polled flag.
+        */
+       if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
                dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
        if (need_zeroout) {
@@ -547,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                iomi.flags |= IOMAP_WRITE;
                dio->flags |= IOMAP_DIO_WRITE;
 
+               /*
+                * Flag as supporting deferred completions, if the issuer
+                * groks it. This can avoid a workqueue punt for writes.
+                * We may later clear this flag if we need to do other IO
+                * as part of this IO completion.
+                */
+               if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
+                       dio->flags |= IOMAP_DIO_CALLER_COMP;
+
                if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
                        ret = -EAGAIN;
                        if (iomi.pos >= dio->i_size ||