xfs: fix COW writeback race
authorChristoph Hellwig <hch@lst.de>
Fri, 20 Jan 2017 17:31:54 +0000 (09:31 -0800)
committerDarrick J. Wong <darrick.wong@oracle.com>
Mon, 23 Jan 2017 18:55:07 +0000 (10:55 -0800)
Due to the way how xfs_iomap_write_allocate tries to convert the whole
found extents from delalloc to real space we can run into a race
condition with multiple threads doing writes to this same extent.
For the non-COW case that is harmless as the only thing that can happen
is that we call xfs_bmapi_write on an extent that has already been
converted to a real allocation.  For COW writes where we move the extent
from the COW to the data fork after I/O completion the race is, however,
not quite as harmless.  In the worst case we are now calling
xfs_bmapi_write on a region that contains hole in the COW work, which
will trip up an assert in debug builds or lead to file system corruption
in non-debug builds.  This seems to be reproducible with workloads of
small O_DSYNC write, although so far I've not managed to come up with
a with an isolated reproducer.

The fix for the issue is relatively simple:  tell xfs_bmapi_write
that we are only asked to convert delayed allocations and skip holes
in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/xfs_iomap.c

index 44773c9..ab82dd4 100644 (file)
@@ -4514,8 +4514,6 @@ xfs_bmapi_write(
        int                     n;              /* current extent index */
        xfs_fileoff_t           obno;           /* old block number (offset) */
        int                     whichfork;      /* data or attr fork */
-       char                    inhole;         /* current location is hole in file */
-       char                    wasdelay;       /* old extent was delayed */
 
 #ifdef DEBUG
        xfs_fileoff_t           orig_bno;       /* original block number value */
@@ -4603,22 +4601,44 @@ xfs_bmapi_write(
        bma.firstblock = firstblock;
 
        while (bno < end && n < *nmap) {
-               inhole = eof || bma.got.br_startoff > bno;
-               wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+               bool                    need_alloc = false, wasdelay = false;
 
-               /*
-                * Make sure we only reflink into a hole.
-                */
-               if (flags & XFS_BMAPI_REMAP)
-                       ASSERT(inhole);
-               if (flags & XFS_BMAPI_COWFORK)
-                       ASSERT(!inhole);
+               /* in hole or beyoned EOF? */
+               if (eof || bma.got.br_startoff > bno) {
+                       if (flags & XFS_BMAPI_DELALLOC) {
+                               /*
+                                * For the COW fork we can reasonably get a
+                                * request for converting an extent that races
+                                * with other threads already having converted
+                                * part of it, as there converting COW to
+                                * regular blocks is not protected using the
+                                * IOLOCK.
+                                */
+                               ASSERT(flags & XFS_BMAPI_COWFORK);
+                               if (!(flags & XFS_BMAPI_COWFORK)) {
+                                       error = -EIO;
+                                       goto error0;
+                               }
+
+                               if (eof || bno >= end)
+                                       break;
+                       } else {
+                               need_alloc = true;
+                       }
+               } else {
+                       /*
+                        * Make sure we only reflink into a hole.
+                        */
+                       ASSERT(!(flags & XFS_BMAPI_REMAP));
+                       if (isnullstartblock(bma.got.br_startblock))
+                               wasdelay = true;
+               }
 
                /*
                 * First, deal with the hole before the allocated space
                 * that we found, if any.
                 */
-               if (inhole || wasdelay) {
+               if (need_alloc || wasdelay) {
                        bma.eof = eof;
                        bma.conv = !!(flags & XFS_BMAPI_CONVERT);
                        bma.wasdel = wasdelay;
index cecd094..cdef87d 100644 (file)
@@ -110,6 +110,9 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK      0x200
 
+/* Only convert delalloc space, don't allocate entirely new extents */
+#define XFS_BMAPI_DELALLOC     0x400
+
 #define XFS_BMAPI_FLAGS \
        { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
        { XFS_BMAPI_METADATA,   "METADATA" }, \
@@ -120,7 +123,8 @@ struct xfs_extent_free_item
        { XFS_BMAPI_CONVERT,    "CONVERT" }, \
        { XFS_BMAPI_ZERO,       "ZERO" }, \
        { XFS_BMAPI_REMAP,      "REMAP" }, \
-       { XFS_BMAPI_COWFORK,    "COWFORK" }
+       { XFS_BMAPI_COWFORK,    "COWFORK" }, \
+       { XFS_BMAPI_DELALLOC,   "DELALLOC" }
 
 
 static inline int xfs_bmapi_aflag(int w)
index 0d14742..1aa3abd 100644 (file)
@@ -681,7 +681,7 @@ xfs_iomap_write_allocate(
        xfs_trans_t     *tp;
        int             nimaps;
        int             error = 0;
-       int             flags = 0;
+       int             flags = XFS_BMAPI_DELALLOC;
        int             nres;
 
        if (whichfork == XFS_COW_FORK)