new iov_iter flavour - ITER_UBUF
author Al Viro <viro@zeniv.linux.org.uk>
Sun, 22 May 2022 18:59:25 +0000 (14:59 -0400)
committer Al Viro <viro@zeniv.linux.org.uk>
Tue, 9 Aug 2022 02:37:15 +0000 (22:37 -0400)
Equivalent of single-segment iovec.  Initialized by iov_iter_ubuf(),
checked for by iter_is_ubuf(), otherwise behaves like ITER_IOVEC
ones.

We are going to expose things like ->write_iter() et al. to those
in subsequent commits.

New predicate (user_backed_iter()) that is true for ITER_IOVEC and
ITER_UBUF; places like direct-IO handling should use that for
checking whether pages we modify after getting them from
iov_iter_get_pages() need to be dirtied.

DO NOT assume that replacing iter_is_iovec() with user_backed_iter()
will solve all problems - there's code that uses iter_is_iovec() to
decide how to poke around in iov_iter guts and for that the predicate
replacement obviously won't suffice.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
12 files changed:
block/fops.c
fs/ceph/file.c
fs/cifs/file.c
fs/direct-io.c
fs/fuse/dev.c
fs/fuse/file.c
fs/gfs2/file.c
fs/iomap/direct-io.c
fs/nfs/direct.c
include/linux/uio.h
lib/iov_iter.c
mm/shmem.c

index a564cd8..b907425 100644 (file)
@@ -75,7 +75,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 
        if (iov_iter_rw(iter) == READ) {
                bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
-               if (iter_is_iovec(iter))
+               if (user_backed_iter(iter))
                        should_dirty = true;
        } else {
                bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
@@ -204,7 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        }
 
        dio->size = 0;
-       if (is_read && iter_is_iovec(iter))
+       if (is_read && user_backed_iter(iter))
                dio->flags |= DIO_SHOULD_DIRTY;
 
        blk_start_plug(&plug);
@@ -335,7 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
        dio->size = bio->bi_iter.bi_size;
 
        if (is_read) {
-               if (iter_is_iovec(iter)) {
+               if (user_backed_iter(iter)) {
                        dio->flags |= DIO_SHOULD_DIRTY;
                        bio_set_pages_dirty(bio);
                }
index da59e83..c535de5 100644 (file)
@@ -1262,7 +1262,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
        size_t count = iov_iter_count(iter);
        loff_t pos = iocb->ki_pos;
        bool write = iov_iter_rw(iter) == WRITE;
-       bool should_dirty = !write && iter_is_iovec(iter);
+       bool should_dirty = !write && user_backed_iter(iter);
 
        if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
                return -EROFS;
index e64cda7..e1e05b2 100644 (file)
@@ -4004,7 +4004,7 @@ static ssize_t __cifs_readv(
        if (!is_sync_kiocb(iocb))
                ctx->iocb = iocb;
 
-       if (iter_is_iovec(to))
+       if (user_backed_iter(to))
                ctx->should_dirty = true;
 
        if (direct) {
index df5e2d0..c7fc01c 100644 (file)
@@ -1251,7 +1251,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
        spin_lock_init(&dio->bio_lock);
        dio->refcount = 1;
 
-       dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
+       dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
        sdio.iter = iter;
        sdio.final_block_in_request = end >> blkbits;
 
index 0e537e5..8d657c2 100644 (file)
@@ -1356,7 +1356,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
        if (!fud)
                return -EPERM;
 
-       if (!iter_is_iovec(to))
+       if (!user_backed_iter(to))
                return -EINVAL;
 
        fuse_copy_init(&cs, 1, to);
@@ -1949,7 +1949,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
        if (!fud)
                return -EPERM;
 
-       if (!iter_is_iovec(from))
+       if (!user_backed_iter(from))
                return -EINVAL;
 
        fuse_copy_init(&cs, 0, from);
index 00fa861..c982e3a 100644 (file)
@@ -1465,7 +1465,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
                        inode_unlock(inode);
        }
 
-       io->should_dirty = !write && iter_is_iovec(iter);
+       io->should_dirty = !write && user_backed_iter(iter);
        while (count) {
                ssize_t nres;
                fl_owner_t owner = current->files;
index 2cceb19..48e6cc7 100644 (file)
@@ -780,7 +780,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
 
        if (!count)
                return false;
-       if (!iter_is_iovec(i))
+       if (!user_backed_iter(i))
                return false;
 
        size = PAGE_SIZE;
index c75d33d..4eb559a 100644 (file)
@@ -533,7 +533,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                        iomi.flags |= IOMAP_NOWAIT;
                }
 
-               if (iter_is_iovec(iter))
+               if (user_backed_iter(iter))
                        dio->flags |= IOMAP_DIO_DIRTY;
        } else {
                iomi.flags |= IOMAP_WRITE;
index 4eb2a83..022e1ce 100644 (file)
@@ -478,7 +478,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
-       if (iter_is_iovec(iter))
+       if (user_backed_iter(iter))
                dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 
        if (!swap)
index 9a2dc49..85bef84 100644 (file)
@@ -26,6 +26,7 @@ enum iter_type {
        ITER_PIPE,
        ITER_XARRAY,
        ITER_DISCARD,
+       ITER_UBUF,
 };
 
 struct iov_iter_state {
@@ -38,6 +39,7 @@ struct iov_iter {
        u8 iter_type;
        bool nofault;
        bool data_source;
+       bool user_backed;
        size_t iov_offset;
        size_t count;
        union {
@@ -46,6 +48,7 @@ struct iov_iter {
                const struct bio_vec *bvec;
                struct xarray *xarray;
                struct pipe_inode_info *pipe;
+               void __user *ubuf;
        };
        union {
                unsigned long nr_segs;
@@ -70,6 +73,11 @@ static inline void iov_iter_save_state(struct iov_iter *iter,
        state->nr_segs = iter->nr_segs;
 }
 
+static inline bool iter_is_ubuf(const struct iov_iter *i)
+{
+       return iov_iter_type(i) == ITER_UBUF;
+}
+
 static inline bool iter_is_iovec(const struct iov_iter *i)
 {
        return iov_iter_type(i) == ITER_IOVEC;
@@ -105,6 +113,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
        return i->data_source ? WRITE : READ;
 }
 
+static inline bool user_backed_iter(const struct iov_iter *i)
+{
+       return i->user_backed;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
@@ -322,4 +335,17 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 int import_single_range(int type, void __user *buf, size_t len,
                 struct iovec *iov, struct iov_iter *i);
 
+static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
+                       void __user *buf, size_t count)
+{
+       WARN_ON(direction & ~(READ | WRITE));
+       *i = (struct iov_iter) {
+               .iter_type = ITER_UBUF,
+               .user_backed = true,
+               .data_source = direction,
+               .ubuf = buf,
+               .count = count
+       };
+}
+
 #endif
index 0e0be33..b3493d2 100644 (file)
 
 #define PIPE_PARANOIA /* for now */
 
+/* covers ubuf and kbuf alike */
+#define iterate_buf(i, n, base, len, off, __p, STEP) {         \
+       size_t __maybe_unused off = 0;                          \
+       len = n;                                                \
+       base = __p + i->iov_offset;                             \
+       len -= (STEP);                                          \
+       i->iov_offset += len;                                   \
+       n = len;                                                \
+}
+
 /* covers iovec and kvec alike */
 #define iterate_iovec(i, n, base, len, off, __p, STEP) {       \
        size_t off = 0;                                         \
@@ -110,7 +120,12 @@ __out:                                                             \
        if (unlikely(i->count < n))                             \
                n = i->count;                                   \
        if (likely(n)) {                                        \
-               if (likely(iter_is_iovec(i))) {                 \
+               if (likely(iter_is_ubuf(i))) {                  \
+                       void __user *base;                      \
+                       size_t len;                             \
+                       iterate_buf(i, n, base, len, off,       \
+                                               i->ubuf, (I))   \
+               } else if (likely(iter_is_iovec(i))) {          \
                        const struct iovec *iov = i->iov;       \
                        void __user *base;                      \
                        size_t len;                             \
@@ -275,7 +290,11 @@ out:
  */
 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
 {
-       if (iter_is_iovec(i)) {
+       if (iter_is_ubuf(i)) {
+               size_t n = min(size, iov_iter_count(i));
+               n -= fault_in_readable(i->ubuf + i->iov_offset, n);
+               return size - n;
+       } else if (iter_is_iovec(i)) {
                size_t count = min(size, iov_iter_count(i));
                const struct iovec *p;
                size_t skip;
@@ -314,7 +333,11 @@ EXPORT_SYMBOL(fault_in_iov_iter_readable);
  */
 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
 {
-       if (iter_is_iovec(i)) {
+       if (iter_is_ubuf(i)) {
+               size_t n = min(size, iov_iter_count(i));
+               n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
+               return size - n;
+       } else if (iter_is_iovec(i)) {
                size_t count = min(size, iov_iter_count(i));
                const struct iovec *p;
                size_t skip;
@@ -345,6 +368,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
        *i = (struct iov_iter) {
                .iter_type = ITER_IOVEC,
                .nofault = false,
+               .user_backed = true,
                .data_source = direction,
                .iov = iov,
                .nr_segs = nr_segs,
@@ -494,7 +518,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_pipe_to_iter(addr, bytes, i);
-       if (iter_is_iovec(i))
+       if (user_backed_iter(i))
                might_fault();
        iterate_and_advance(i, bytes, base, len, off,
                copyout(base, addr + off, len),
@@ -583,7 +607,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_mc_pipe_to_iter(addr, bytes, i);
-       if (iter_is_iovec(i))
+       if (user_backed_iter(i))
                might_fault();
        __iterate_and_advance(i, bytes, base, len, off,
                copyout_mc(base, addr + off, len),
@@ -601,7 +625,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
                WARN_ON(1);
                return 0;
        }
-       if (iter_is_iovec(i))
+       if (user_backed_iter(i))
                might_fault();
        iterate_and_advance(i, bytes, base, len, off,
                copyin(addr + off, base, len),
@@ -894,16 +918,16 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 {
        if (unlikely(i->count < size))
                size = i->count;
-       if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
+       if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
+               i->iov_offset += size;
+               i->count -= size;
+       } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* iovec and kvec have identical layouts */
                iov_iter_iovec_advance(i, size);
        } else if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
        } else if (iov_iter_is_pipe(i)) {
                pipe_advance(i, size);
-       } else if (unlikely(iov_iter_is_xarray(i))) {
-               i->iov_offset += size;
-               i->count -= size;
        } else if (iov_iter_is_discard(i)) {
                i->count -= size;
        }
@@ -950,7 +974,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
                return;
        }
        unroll -= i->iov_offset;
-       if (iov_iter_is_xarray(i)) {
+       if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                BUG(); /* We should never go beyond the start of the specified
                        * range since we might then be straying into pages that
                        * aren't pinned.
@@ -1158,6 +1182,14 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                         unsigned len_mask)
 {
+       if (likely(iter_is_ubuf(i))) {
+               if (i->count & len_mask)
+                       return false;
+               if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
+                       return false;
+               return true;
+       }
+
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_aligned_iovec(i, addr_mask, len_mask);
 
@@ -1233,6 +1265,13 @@ static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
 
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
+       if (likely(iter_is_ubuf(i))) {
+               size_t size = i->count;
+               if (size)
+                       return ((unsigned long)i->ubuf + i->iov_offset) | size;
+               return 0;
+       }
+
        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_alignment_iovec(i);
@@ -1263,6 +1302,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
        size_t size = i->count;
        unsigned k;
 
+       if (iter_is_ubuf(i))
+               return 0;
+
        if (WARN_ON(!iter_is_iovec(i)))
                return ~0U;
 
@@ -1385,12 +1427,15 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
        return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
 }
 
-/* must be done on non-empty ITER_IOVEC one */
+/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
 {
        size_t skip;
        long k;
 
+       if (iter_is_ubuf(i))
+               return (unsigned long)i->ubuf + i->iov_offset;
+
        for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
                size_t len = i->iov[k].iov_len - skip;
 
@@ -1432,7 +1477,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;
 
-       if (likely(iter_is_iovec(i))) {
+       if (likely(user_backed_iter(i))) {
                unsigned int gup_flags = 0;
                unsigned long addr;
 
@@ -1559,7 +1604,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;
 
-       if (likely(iter_is_iovec(i))) {
+       if (likely(user_backed_iter(i))) {
                unsigned int gup_flags = 0;
                unsigned long addr;
 
@@ -1715,6 +1760,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
        if (unlikely(!i->count))
                return 0;
+       if (likely(iter_is_ubuf(i))) {
+               unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
+               int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
+               return min(npages, maxpages);
+       }
        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_npages(i, maxpages);
@@ -1749,17 +1799,16 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
                WARN_ON(1);
                return NULL;
        }
-       if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
-               return NULL;
        if (iov_iter_is_bvec(new))
                return new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
-       else
+       else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
                /* iovec and kvec have identical layout */
                return new->iov = kmemdup(new->iov,
                                   new->nr_segs * sizeof(struct iovec),
                                   flags);
+       return NULL;
 }
 EXPORT_SYMBOL(dup_iter);
 
@@ -1953,10 +2002,12 @@ EXPORT_SYMBOL(import_single_range);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
-                        !iov_iter_is_kvec(i))
+                        !iov_iter_is_kvec(i) && !iter_is_ubuf(i))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
+       if (iter_is_ubuf(i))
+               return;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
index e5e43b9..e3a7e17 100644 (file)
@@ -2602,7 +2602,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
                        ret = copy_page_to_iter(page, offset, nr, to);
                        put_page(page);
 
-               } else if (iter_is_iovec(to)) {
+               } else if (user_backed_iter(to)) {
                        /*
                         * Copy to user tends to be so well optimized, but
                         * clear_user() not so much, that it is noticeably