ceph: fix iov_iter issues in ceph_direct_read_write()
authorIlya Dryomov <idryomov@gmail.com>
Fri, 4 May 2018 14:57:31 +0000 (16:57 +0200)
committerIlya Dryomov <idryomov@gmail.com>
Thu, 10 May 2018 08:15:12 +0000 (10:15 +0200)
dio_get_pagev_size() and dio_get_pages_alloc() introduced in commit
b5b98989dc7e ("ceph: combine as many iovec as possile into one OSD
request") assume that the passed iov_iter is ITER_IOVEC.  This isn't
the case with splice where it ends up poking into the guts of ITER_BVEC
or ITER_PIPE iterators, causing lockups and crashes easily reproduced
with generic/095.

Rather than trying to figure out gap alignment and stuff pages into
a page vector, add a helper for going from iov_iter to a bio_vec array
and make use of the new CEPH_OSD_DATA_TYPE_BVECS code.

Fixes: b5b98989dc7e ("ceph: combine as many iovec as possile into one OSD request")
Link: http://tracker.ceph.com/issues/18130
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Tested-by: Luis Henriques <lhenriques@suse.com>
fs/ceph/file.c

index 8ce7849..cf0e45b 100644 (file)
@@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags)
  */
 
 /*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
+ * How many pages to get in one call to iov_iter_get_pages().  This
+ * determines the size of the on-stack array used as a buffer.
  */
-static size_t dio_get_pagev_size(const struct iov_iter *it)
+#define ITER_GET_BVECS_PAGES   64
+
+static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
+                               struct bio_vec *bvecs)
 {
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
-    size_t size;
-
-    size = iov->iov_len - it->iov_offset;
-    /*
-     * An iov can be page vectored when both the current tail
-     * and the next base are page aligned.
-     */
-    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
-    }
-    dout("dio_get_pagevlen len = %zu\n", size);
-    return size;
+       size_t size = 0;
+       int bvec_idx = 0;
+
+       if (maxsize > iov_iter_count(iter))
+               maxsize = iov_iter_count(iter);
+
+       while (size < maxsize) {
+               struct page *pages[ITER_GET_BVECS_PAGES];
+               ssize_t bytes;
+               size_t start;
+               int idx = 0;
+
+               bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+                                          ITER_GET_BVECS_PAGES, &start);
+               if (bytes < 0)
+                       return size ?: bytes;
+
+               iov_iter_advance(iter, bytes);
+               size += bytes;
+
+               for ( ; bytes; idx++, bvec_idx++) {
+                       struct bio_vec bv = {
+                               .bv_page = pages[idx],
+                               .bv_len = min_t(int, bytes, PAGE_SIZE - start),
+                               .bv_offset = start,
+                       };
+
+                       bvecs[bvec_idx] = bv;
+                       bytes -= bv.bv_len;
+                       start = 0;
+               }
+       }
+
+       return size;
 }
 
 /*
- * Allocate a page vector based on (@it, @nbytes).
- * The return value is the tuple describing a page vector,
- * that is (@pages, @page_align, @num_pages).
+ * iov_iter_get_pages() only considers one iov_iter segment, no matter
+ * what maxsize or maxpages are given.  For ITER_BVEC that is a single
+ * page.
+ *
+ * Attempt to get up to @maxsize bytes worth of pages from @iter.
+ * Return the number of bytes in the created bio_vec array, or an error.
  */
-static struct page **
-dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
-                   size_t *page_align, int *num_pages)
+static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
+                                   struct bio_vec **bvecs, int *num_bvecs)
 {
-       struct iov_iter tmp_it = *it;
-       size_t align;
-       struct page **pages;
-       int ret = 0, idx, npages;
+       struct bio_vec *bv;
+       size_t orig_count = iov_iter_count(iter);
+       ssize_t bytes;
+       int npages;
 
-       align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
-               (PAGE_SIZE - 1);
-       npages = calc_pages_for(align, nbytes);
-       pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
+       iov_iter_truncate(iter, maxsize);
+       npages = iov_iter_npages(iter, INT_MAX);
+       iov_iter_reexpand(iter, orig_count);
 
-       for (idx = 0; idx < npages; ) {
-               size_t start;
-               ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
-                                        npages - idx, &start);
-               if (ret < 0)
-                       goto fail;
+       /*
+        * __iter_get_bvecs() may populate only part of the array -- zero it
+        * out.
+        */
+       bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
+       if (!bv)
+               return -ENOMEM;
 
-               iov_iter_advance(&tmp_it, ret);
-               nbytes -= ret;
-               idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+       bytes = __iter_get_bvecs(iter, maxsize, bv);
+       if (bytes < 0) {
+               /*
+                * No pages were pinned -- just free the array.
+                */
+               kvfree(bv);
+               return bytes;
        }
 
-       BUG_ON(nbytes != 0);
-       *num_pages = npages;
-       *page_align = align;
-       dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
-       return pages;
-fail:
-       ceph_put_page_vector(pages, idx, false);
-       return ERR_PTR(ret);
+       *bvecs = bv;
+       *num_bvecs = npages;
+       return bytes;
+}
+
+static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
+{
+       int i;
+
+       for (i = 0; i < num_bvecs; i++) {
+               if (bvecs[i].bv_page) {
+                       if (should_dirty)
+                               set_page_dirty_lock(bvecs[i].bv_page);
+                       put_page(bvecs[i].bv_page);
+               }
+       }
+       kvfree(bvecs);
 }
 
 /*
@@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
        struct inode *inode = req->r_inode;
        struct ceph_aio_request *aio_req = req->r_priv;
        struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
-       int num_pages = calc_pages_for((u64)osd_data->alignment,
-                                      osd_data->length);
 
-       dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
-            inode, rc, osd_data->length);
+       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
+       BUG_ON(!osd_data->num_bvecs);
+
+       dout("ceph_aio_complete_req %p rc %d bytes %u\n",
+            inode, rc, osd_data->bvec_pos.iter.bi_size);
 
        if (rc == -EOLDSNAPC) {
                struct ceph_aio_work *aio_work;
@@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
        } else if (!aio_req->write) {
                if (rc == -ENOENT)
                        rc = 0;
-               if (rc >= 0 && osd_data->length > rc) {
-                       int zoff = osd_data->alignment + rc;
-                       int zlen = osd_data->length - rc;
+               if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+                       struct iov_iter i;
+                       int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+
                        /*
                         * If read is satisfied by single OSD request,
                         * it can pass EOF. Otherwise read is within
@@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
                                aio_req->total_len = rc + zlen;
                        }
 
-                       if (zlen > 0)
-                               ceph_zero_page_vector_range(zoff, zlen,
-                                                           osd_data->pages);
+                       iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+                                     osd_data->num_bvecs,
+                                     osd_data->bvec_pos.iter.bi_size);
+                       iov_iter_advance(&i, rc);
+                       iov_iter_zero(zlen, &i);
                }
        }
 
-       ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
+       put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
+                 aio_req->should_dirty);
        ceph_osdc_put_request(req);
 
        if (rc < 0)
@@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_vino vino;
        struct ceph_osd_request *req;
-       struct page **pages;
+       struct bio_vec *bvecs;
        struct ceph_aio_request *aio_req = NULL;
        int num_pages = 0;
        int flags;
@@ -914,8 +954,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
        }
 
        while (iov_iter_count(iter) > 0) {
-               u64 size = dio_get_pagev_size(iter);
-               size_t start = 0;
+               u64 size = iov_iter_count(iter);
                ssize_t len;
 
                if (write)
@@ -938,13 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        break;
                }
 
-               len = size;
-               pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
-               if (IS_ERR(pages)) {
+               len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
+               if (len < 0) {
                        ceph_osdc_put_request(req);
-                       ret = PTR_ERR(pages);
+                       ret = len;
                        break;
                }
+               if (len != size)
+                       osd_req_op_extent_update(req, 0, len);
 
                /*
                 * To simplify error handling, allow AIO when IO within i_size
@@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        req->r_mtime = mtime;
                }
 
-               osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
-                                                false, false);
+               osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
 
                if (aio_req) {
                        aio_req->total_len += len;
@@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
 
                        pos += len;
-                       iov_iter_advance(iter, len);
                        continue;
                }
 
@@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        if (ret == -ENOENT)
                                ret = 0;
                        if (ret >= 0 && ret < len && pos + ret < size) {
+                               struct iov_iter i;
                                int zlen = min_t(size_t, len - ret,
                                                 size - pos - ret);
-                               ceph_zero_page_vector_range(start + ret, zlen,
-                                                           pages);
+
+                               iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
+                                             len);
+                               iov_iter_advance(&i, ret);
+                               iov_iter_zero(zlen, &i);
                                ret += zlen;
                        }
                        if (ret >= 0)
                                len = ret;
                }
 
-               ceph_put_page_vector(pages, num_pages, should_dirty);
-
+               put_bvecs(bvecs, num_pages, should_dirty);
                ceph_osdc_put_request(req);
                if (ret < 0)
                        break;
 
                pos += len;
-               iov_iter_advance(iter, len);
-
                if (!write && pos >= size)
                        break;