pnfsblock: fix partial page buffer wirte
authorPeng Tao <bergwolf@gmail.com>
Thu, 23 Aug 2012 16:27:51 +0000 (00:27 +0800)
committerTrond Myklebust <Trond.Myklebust@netapp.com>
Mon, 1 Oct 2012 22:38:24 +0000 (15:38 -0700)
If applications use flock to protect its write range, generic NFS
will not do read-modify-write cycle at page cache level. Therefore
LD should know how to handle non-sector aligned writes. Otherwise
there will be data corruption.

Cc: stable <stable@vger.kernel.org>
Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
fs/nfs/blocklayout/blocklayout.c
fs/nfs/blocklayout/blocklayout.h

index 3c61514..a9fe644 100644 (file)
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
        return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
                                      sector_t isect, struct page *page,
                                      struct pnfs_block_extent *be,
                                      void (*end_io)(struct bio *, int err),
-                                     struct parallel_io *par)
+                                     struct parallel_io *par,
+                                     unsigned int offset, int len)
 {
+       isect = isect + (offset >> SECTOR_SHIFT);
+       dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+               npg, rw, (unsigned long long)isect, offset, len);
 retry:
        if (!bio) {
                bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
                if (!bio)
                        return ERR_PTR(-ENOMEM);
        }
-       if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+       if (bio_add_page(bio, page, len, offset) < len) {
                bio = bl_submit_bio(rw, bio);
                goto retry;
        }
        return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+                                     sector_t isect, struct page *page,
+                                     struct pnfs_block_extent *be,
+                                     void (*end_io)(struct bio *, int err),
+                                     struct parallel_io *par)
+{
+       return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+                                 end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -450,6 +464,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
        return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct page *page = bvec->bv_page;
+
+       /* Only one page in bvec */
+       unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+                   unsigned int offset, unsigned int len)
+{
+       struct bio *bio;
+       struct page *shadow_page;
+       sector_t isect;
+       char *kaddr, *kshadow_addr;
+       int ret = 0;
+
+       dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+       shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (shadow_page == NULL)
+               return -ENOMEM;
+
+       bio = bio_alloc(GFP_NOIO, 1);
+       if (bio == NULL)
+               return -ENOMEM;
+
+       isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+               (offset / SECTOR_SIZE);
+
+       bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+       bio->bi_bdev = be->be_mdev;
+       bio->bi_end_io = bl_read_single_end_io;
+
+       lock_page(shadow_page);
+       if (bio_add_page(bio, shadow_page,
+                        SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+               unlock_page(shadow_page);
+               bio_put(bio);
+               return -EIO;
+       }
+
+       submit_bio(READ, bio);
+       wait_on_page_locked(shadow_page);
+       if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+               ret = -EIO;
+       } else {
+               kaddr = kmap_atomic(page);
+               kshadow_addr = kmap_atomic(shadow_page);
+               memcpy(kaddr + offset, kshadow_addr + offset, len);
+               kunmap_atomic(kshadow_addr);
+               kunmap_atomic(kaddr);
+       }
+       __free_page(shadow_page);
+       bio_put(bio);
+
+       return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+                         unsigned int dirty_offset, unsigned int dirty_len,
+                         bool full_page)
+{
+       int ret = 0;
+       unsigned int start, end;
+
+       if (full_page) {
+               start = 0;
+               end = PAGE_CACHE_SIZE;
+       } else {
+               start = round_down(dirty_offset, SECTOR_SIZE);
+               end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+       }
+
+       dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+       if (!be) {
+               zero_user_segments(page, start, dirty_offset,
+                                  dirty_offset + dirty_len, end);
+               if (start == 0 && end == PAGE_CACHE_SIZE &&
+                   trylock_page(page)) {
+                       SetPageUptodate(page);
+                       unlock_page(page);
+               }
+               return ret;
+       }
+
+       if (start != dirty_offset)
+               ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+       if (!ret && (dirty_offset + dirty_len < end))
+               ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+                                         end - dirty_offset - dirty_len);
+
+       return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
@@ -483,7 +597,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
        SetPageUptodate(page);
 
 cleanup:
-       bl_put_extent(cow_read);
        if (bh)
                free_buffer_head(bh);
        if (ret) {
@@ -555,6 +668,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
        struct parallel_io *par;
        loff_t offset = wdata->args.offset;
        size_t count = wdata->args.count;
+       unsigned int pg_offset, pg_len, saved_len;
        struct page **pages = wdata->args.pages;
        struct page *page;
        pgoff_t index;
@@ -659,10 +773,11 @@ next_page:
                if (!extent_length) {
                        /* We've used up the previous extent */
                        bl_put_extent(be);
+                       bl_put_extent(cow_read);
                        bio = bl_submit_bio(WRITE, bio);
                        /* Get the next one */
                        be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-                                            isect, NULL);
+                                            isect, &cow_read);
                        if (!be || !is_writable(be, isect)) {
                                header->pnfs_error = -EINVAL;
                                goto out;
@@ -679,7 +794,26 @@ next_page:
                        extent_length = be->be_length -
                            (isect - be->be_f_offset);
                }
-               if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+               dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+               pg_offset = offset & ~PAGE_CACHE_MASK;
+               if (pg_offset + count > PAGE_CACHE_SIZE)
+                       pg_len = PAGE_CACHE_SIZE - pg_offset;
+               else
+                       pg_len = count;
+
+               saved_len = pg_len;
+               if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+                   !bl_is_sector_init(be->be_inval, isect)) {
+                       ret = bl_read_partial_page_sync(pages[i], cow_read,
+                                                       pg_offset, pg_len, true);
+                       if (ret) {
+                               dprintk("%s bl_read_partial_page_sync fail %d\n",
+                                       __func__, ret);
+                               header->pnfs_error = ret;
+                               goto out;
+                       }
+
                        ret = bl_mark_sectors_init(be->be_inval, isect,
                                                       PAGE_CACHE_SECTORS);
                        if (unlikely(ret)) {
@@ -688,15 +822,35 @@ next_page:
                                header->pnfs_error = ret;
                                goto out;
                        }
+
+                       /* Expand to full page write */
+                       pg_offset = 0;
+                       pg_len = PAGE_CACHE_SIZE;
+               } else if  ((pg_offset & (SECTOR_SIZE - 1)) ||
+                           (pg_len & (SECTOR_SIZE - 1))){
+                       /* ahh, nasty case. We have to do sync full sector
+                        * read-modify-write cycles.
+                        */
+                       unsigned int saved_offset = pg_offset;
+                       ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+                                                       pg_len, false);
+                       pg_offset = round_down(pg_offset, SECTOR_SIZE);
+                       pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+                                - pg_offset;
                }
-               bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+               bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
                                         isect, pages[i], be,
-                                        bl_end_io_write, par);
+                                        bl_end_io_write, par,
+                                        pg_offset, pg_len);
                if (IS_ERR(bio)) {
                        header->pnfs_error = PTR_ERR(bio);
                        bio = NULL;
                        goto out;
                }
+               offset += saved_len;
+               count -= saved_len;
                isect += PAGE_CACHE_SECTORS;
                last_isect = isect;
                extent_length -= PAGE_CACHE_SECTORS;
@@ -714,17 +868,16 @@ next_page:
        }
 
 write_done:
-       wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-       if (count < wdata->res.count) {
-               wdata->res.count = count;
-       }
+       wdata->res.count = wdata->args.count;
 out:
        bl_put_extent(be);
+       bl_put_extent(cow_read);
        bl_submit_bio(WRITE, bio);
        put_parallel(par);
        return PNFS_ATTEMPTED;
 out_mds:
        bl_put_extent(be);
+       bl_put_extent(cow_read);
        kfree(par);
        return PNFS_NOT_ATTEMPTED;
 }
index 0335069..39bb51a 100644 (file)
@@ -41,6 +41,7 @@
 
 #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
 struct block_mount_id {
        spinlock_t                      bm_lock;    /* protects list */