io_uring: add splice(2) support
authorPavel Begunkov <asml.silence@gmail.com>
Mon, 24 Feb 2020 08:32:45 +0000 (11:32 +0300)
committerJens Axboe <axboe@kernel.dk>
Mon, 2 Mar 2020 21:04:37 +0000 (14:04 -0700)
Add support for splice(2).

- output file is specified as sqe->fd, so it's handled by generic code
- hash_reg_file handled by generic code as well
- len is 32-bit (taken from sqe->len), which should be sufficient
- fd_in is treated as a registered (fixed) file when SPLICE_F_FD_IN_FIXED
is set; that bit is passed alongside the splice flags in sqe->splice_flags.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/io_uring.c
include/uapi/linux/io_uring.h

index 1a3de73..1ef20a2 100644 (file)
@@ -76,6 +76,7 @@
 #include <linux/fadvise.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/splice.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -428,6 +429,15 @@ struct io_epoll {
        struct epoll_event              event;
 };
 
+/* State for an IORING_OP_SPLICE request, stored in io_kiocb. */
+struct io_splice {
+       /* output file (sqe->fd); set up by generic code, not io_splice_prep() */
+       struct file                     *file_out;
+       /* input file, resolved from sqe->splice_fd_in in io_splice_prep() */
+       struct file                     *file_in;
+       /* output offset from sqe->off; -1 makes io_splice() pass NULL */
+       loff_t                          off_out;
+       /* input offset from sqe->splice_off_in; -1 makes io_splice() pass NULL */
+       loff_t                          off_in;
+       /* length handed to do_splice(), read from the 32-bit sqe->len */
+       u64                             len;
+       /* sqe->splice_flags: SPLICE_F_ALL bits plus SPLICE_F_FD_IN_FIXED */
+       unsigned int                    flags;
+};
+
 struct io_async_connect {
        struct sockaddr_storage         address;
 };
@@ -544,6 +554,7 @@ struct io_kiocb {
                struct io_fadvise       fadvise;
                struct io_madvise       madvise;
                struct io_epoll         epoll;
+               struct io_splice        splice;
        };
 
        struct io_async_ctx             *io;
@@ -744,6 +755,11 @@ static const struct io_op_def io_op_defs[] = {
                .unbound_nonreg_file    = 1,
                .file_table             = 1,
        },
+       [IORING_OP_SPLICE] = {
+               .needs_file             = 1,
+               .hash_reg_file          = 1,
+               .unbound_nonreg_file    = 1,
+       }
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -758,6 +774,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static int io_grab_files(struct io_kiocb *req);
 static void io_ring_file_ref_flush(struct fixed_file_data *data);
 static void io_cleanup_req(struct io_kiocb *req);
+/*
+ * Forward declaration: io_splice_prep() calls io_file_get() to resolve the
+ * splice input file.
+ */
+static int io_file_get(struct io_submit_state *state,
+                      struct io_kiocb *req,
+                      int fd, struct file **out_file,
+                      bool fixed);
 
 static struct kmem_cache *req_cachep;
 
@@ -2404,6 +2424,77 @@ out_free:
        return ret;
 }
 
+/*
+ * Prepare an IORING_OP_SPLICE request: copy the splice parameters out of the
+ * SQE and resolve the input file through io_file_get().
+ *
+ * Returns 0 on success (or when prep has already run for this request),
+ * -EINVAL if unknown bits are set in sqe->splice_flags, or the error
+ * returned by io_file_get().
+ */
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       const unsigned int known_flags = SPLICE_F_ALL | SPLICE_F_FD_IN_FIXED;
+       struct io_splice *splice = &req->splice;
+       bool fixed_in;
+       int err;
+
+       /* REQ_F_NEED_CLEANUP is set below once file_in is held: prep is done. */
+       if (req->flags & REQ_F_NEED_CLEANUP)
+               return 0;
+
+       splice->file_in = NULL;
+       splice->off_in = READ_ONCE(sqe->splice_off_in);
+       splice->off_out = READ_ONCE(sqe->off);
+       splice->len = READ_ONCE(sqe->len);
+       splice->flags = READ_ONCE(sqe->splice_flags);
+
+       if (unlikely(splice->flags & ~known_flags))
+               return -EINVAL;
+
+       fixed_in = splice->flags & SPLICE_F_FD_IN_FIXED;
+       err = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+                         &splice->file_in, fixed_in);
+       if (err)
+               return err;
+
+       /* From here on io_cleanup_req() is responsible for dropping file_in. */
+       req->flags |= REQ_F_NEED_CLEANUP;
+
+       /* Input files that are not regular files mark the work as unbound. */
+       if (!S_ISREG(file_inode(splice->file_in)->i_mode))
+               req->work.flags |= IO_WQ_WORK_UNBOUND;
+
+       return 0;
+}
+
+/*
+ * Decide whether a nonblocking splice attempt on @file has to be punted.
+ *
+ * Returns false when get_pipe_info() recognises @file (a pipe), true when
+ * the file does not support async operation, and otherwise true unless the
+ * file was opened with O_NONBLOCK. io_splice() returns -EAGAIN when this
+ * returns true for either the input or the output file.
+ */
+static bool io_splice_punt(struct file *file)
+{
+       if (get_pipe_info(file))
+               return false;
+       if (!io_file_supports_async(file))
+               return true;
+       /* O_NONBLOCK is an open flag kept in f_flags; f_mode holds FMODE_* bits. */
+       return !(file->f_flags & O_NONBLOCK);
+}
+
+/*
+ * Issue an IORING_OP_SPLICE request prepared by io_splice_prep().
+ *
+ * With @force_nonblock set, returns -EAGAIN without completing the request
+ * if either file has to be punted or do_splice() itself returns -EAGAIN.
+ * Otherwise the input file is released, a completion event carrying the
+ * do_splice() result is posted, and 0 is returned.
+ */
+static int io_splice(struct io_kiocb *req, struct io_kiocb **nxt,
+                    bool force_nonblock)
+{
+       struct io_splice *splice = &req->splice;
+       struct file *src = splice->file_in;
+       struct file *dst = splice->file_out;
+       /* SPLICE_F_FD_IN_FIXED is io_uring-only; strip it before do_splice(). */
+       unsigned int flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
+       loff_t *pos_in = NULL;
+       loff_t *pos_out = NULL;
+       long done;
+
+       if (force_nonblock) {
+               if (io_splice_punt(src) || io_splice_punt(dst))
+                       return -EAGAIN;
+               flags |= SPLICE_F_NONBLOCK;
+       }
+
+       /* An offset of -1 means no explicit offset: hand do_splice() NULL. */
+       if (splice->off_in != -1)
+               pos_in = &splice->off_in;
+       if (splice->off_out != -1)
+               pos_out = &splice->off_out;
+
+       done = do_splice(src, pos_in, dst, pos_out, splice->len, flags);
+       if (force_nonblock && done == -EAGAIN)
+               return -EAGAIN;
+
+       /* The request completes here, so drop file_in and skip later cleanup. */
+       io_put_file(req, src, (splice->flags & SPLICE_F_FD_IN_FIXED));
+       req->flags &= ~REQ_F_NEED_CLEANUP;
+
+       io_cqring_add_event(req, done);
+       /* Anything other than a full-length transfer fails linked requests. */
+       if (done != splice->len)
+               req_set_fail_links(req);
+       io_put_req_find_next(req, nxt);
+       return 0;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -4230,6 +4321,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
        case IORING_OP_EPOLL_CTL:
                ret = io_epoll_ctl_prep(req, sqe);
                break;
+       case IORING_OP_SPLICE:
+               ret = io_splice_prep(req, sqe);
+               break;
        default:
                printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                                req->opcode);
@@ -4292,6 +4386,10 @@ static void io_cleanup_req(struct io_kiocb *req)
        case IORING_OP_STATX:
                putname(req->open.filename);
                break;
+       case IORING_OP_SPLICE:
+               io_put_file(req, req->splice.file_in,
+                           (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+               break;
        }
 
        req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -4495,6 +4593,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                }
                ret = io_epoll_ctl(req, nxt, force_nonblock);
                break;
+       case IORING_OP_SPLICE:
+               if (sqe) {
+                       ret = io_splice_prep(req, sqe);
+                       if (ret < 0)
+                               break;
+               }
+               ret = io_splice(req, nxt, force_nonblock);
+               break;
        default:
                ret = -EINVAL;
                break;
@@ -7230,6 +7336,7 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
+       BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
        BUILD_BUG_SQE_ELEM(24, __u32,  len);
        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
@@ -7244,9 +7351,11 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
+       BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
+       BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
index 3f7961c..08891cc 100644 (file)
@@ -23,7 +23,10 @@ struct io_uring_sqe {
                __u64   off;    /* offset into file */
                __u64   addr2;
        };
-       __u64   addr;           /* pointer to buffer or iovecs */
+       union {
+               __u64   addr;   /* pointer to buffer or iovecs */
+               __u64   splice_off_in;
+       };
        __u32   len;            /* buffer size or number of iovecs */
        union {
                __kernel_rwf_t  rw_flags;
@@ -37,6 +40,7 @@ struct io_uring_sqe {
                __u32           open_flags;
                __u32           statx_flags;
                __u32           fadvise_advice;
+               __u32           splice_flags;
        };
        __u64   user_data;      /* data to be passed back at completion time */
        union {
@@ -45,6 +49,7 @@ struct io_uring_sqe {
                        __u16   buf_index;
                        /* personality to use, if used */
                        __u16   personality;
+                       __s32   splice_fd_in;
                };
                __u64   __pad2[3];
        };
@@ -113,6 +118,7 @@ enum {
        IORING_OP_RECV,
        IORING_OP_OPENAT2,
        IORING_OP_EPOLL_CTL,
+       IORING_OP_SPLICE,
 
        /* this goes last, obviously */
        IORING_OP_LAST,
@@ -129,6 +135,12 @@ enum {
 #define IORING_TIMEOUT_ABS     (1U << 0)
 
 /*
+ * sqe->splice_flags
+ * Extends the splice(2) flags with io_uring-specific bits.
+ *
+ * SPLICE_F_FD_IN_FIXED: sqe->splice_fd_in refers to a registered (fixed)
+ * file instead of a regular file descriptor.
+ */
+#define SPLICE_F_FD_IN_FIXED   (1U << 31) /* the last bit of __u32 */
+
+/*
  * IO completion data structure (Completion Queue Entry)
  */
 struct io_uring_cqe {