xfs: open code end_buffer_async_write in xfs_finish_page_writeback
author Christoph Hellwig <hch@lst.de>
Sun, 17 Sep 2017 21:07:11 +0000 (14:07 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 20 Sep 2017 06:20:02 +0000 (08:20 +0200)
commit 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 upstream.

Our loop in xfs_finish_page_writeback iterates over all buffer heads in
a page and calls end_buffer_async_write, which itself iterates over all
buffers in the page to check if any I/O is still in flight.  This is not
only inefficient, but also potentially dangerous, as
end_buffer_async_write can cause the page and all buffers to be freed.

Replace it with a single loop that does the work of end_buffer_async_write
on a per-page basis.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
fs/xfs/xfs_aops.c

index f750d88..d23889e 100644 (file)
@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
  *
- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- * the page at all, as we may be racing with memory reclaim and it can free both
- * the bufferhead chain and the page as it will see the page as clean and
- * unused.
+ * Note that we open code the action in end_buffer_async_write here so that we
+ * only have to iterate over the buffers attached to the page once.  This is not
+ * only more efficient, but also ensures that we only call end_page_writeback
+ * at the end of the iteration, and thus avoids the pitfall of having the page
+ * and buffers potentially freed after every call to end_buffer_async_write.
  */
 static void
 xfs_finish_page_writeback(
@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
        struct bio_vec          *bvec,
        int                     error)
 {
-       unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh, *next;
+       struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
+       bool                    busy = false;
        unsigned int            off = 0;
-       unsigned int            bsize;
+       unsigned long           flags;
 
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
-       ASSERT(end < PAGE_SIZE);
+       ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
        ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
 
-       bh = head = page_buffers(bvec->bv_page);
-
-       bsize = bh->b_size;
+       local_irq_save(flags);
+       bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
        do {
-               if (off > end)
-                       break;
-               next = bh->b_this_page;
-               if (off < bvec->bv_offset)
-                       goto next_bh;
-               bh->b_end_io(bh, !error);
-next_bh:
-               off += bsize;
-       } while ((bh = next) != head);
+               if (off >= bvec->bv_offset &&
+                   off < bvec->bv_offset + bvec->bv_len) {
+                       ASSERT(buffer_async_write(bh));
+                       ASSERT(bh->b_end_io == NULL);
+
+                       if (error) {
+                               mapping_set_error(bvec->bv_page->mapping, -EIO);
+                               set_buffer_write_io_error(bh);
+                               clear_buffer_uptodate(bh);
+                               SetPageError(bvec->bv_page);
+                       } else {
+                               set_buffer_uptodate(bh);
+                       }
+                       clear_buffer_async_write(bh);
+                       unlock_buffer(bh);
+               } else if (buffer_async_write(bh)) {
+                       ASSERT(buffer_locked(bh));
+                       busy = true;
+               }
+               off += bh->b_size;
+       } while ((bh = bh->b_this_page) != head);
+       bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+       local_irq_restore(flags);
+
+       if (!busy)
+               end_page_writeback(bvec->bv_page);
 }
 
 /*
@@ -138,8 +154,10 @@ xfs_destroy_ioend(
        int                     error)
 {
        struct inode            *inode = ioend->io_inode;
-       struct bio              *last = ioend->io_bio;
-       struct bio              *bio, *next;
+       struct bio              *bio = &ioend->io_inline_bio;
+       struct bio              *last = ioend->io_bio, *next;
+       u64                     start = bio->bi_iter.bi_sector;
+       bool                    quiet = bio_flagged(bio, BIO_QUIET);
 
        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec  *bvec;
@@ -160,6 +178,11 @@ xfs_destroy_ioend(
 
                bio_put(bio);
        }
+
+       if (unlikely(error && !quiet)) {
+               xfs_err_ratelimited(XFS_I(inode)->i_mount,
+                       "writeback error on sector %llu", start);
+       }
 }
 
 /*
@@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
        ASSERT(!buffer_delay(bh));
        ASSERT(!buffer_unwritten(bh));
 
-       mark_buffer_async_write(bh);
+       bh->b_end_io = NULL;
+       set_buffer_async_write(bh);
        set_buffer_uptodate(bh);
        clear_buffer_dirty(bh);
 }