Merge tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 14 Jun 2020 16:47:25 +0000 (09:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 14 Jun 2020 16:47:25 +0000 (09:47 -0700)
Pull btrfs updates from David Sterba:
 "This reverts the direct io port to iomap infrastructure of btrfs
  merged in the first pull request. We found problems in invalidate page
  that don't seem to be fixable as regressions or without changing iomap
  code that would not affect other filesystems.

  There are four reverts in total, but three of them are followup
  cleanups needed to revert a43a67a2d715 cleanly. The result is the
  buffer head based implementation of direct io.

  Reverts are not great, but under current circumstances I don't see
  better options"

* tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Revert "btrfs: switch to iomap_dio_rw() for dio"
  Revert "fs: remove dio_end_io()"
  Revert "btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK"
  Revert "btrfs: split btrfs_direct_IO to read and write part"

fs/btrfs/inode.c
fs/direct-io.c
include/linux/fs.h

diff --combined fs/btrfs/inode.c
@@@ -5,6 -5,7 +5,7 @@@
  
  #include <linux/kernel.h>
  #include <linux/bio.h>
+ #include <linux/buffer_head.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/pagemap.h>
@@@ -57,9 -58,9 +58,9 @@@ struct btrfs_iget_args 
  
  struct btrfs_dio_data {
        u64 reserve;
-       loff_t length;
-       ssize_t submitted;
-       struct extent_changeset *data_reserved;
+       u64 unsubmitted_oe_range_start;
+       u64 unsubmitted_oe_range_end;
+       int overwrite;
  };
  
  static const struct inode_operations btrfs_dir_inode_operations;
@@@ -4810,7 -4811,10 +4811,10 @@@ static int btrfs_setsize(struct inode *
  
                truncate_setsize(inode, newsize);
  
+               /* Disable nonlocked read DIO to avoid the endless truncate */
+               btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
                inode_dio_wait(inode);
+               btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
  
                ret = btrfs_truncate(inode, newsize == oldsize);
                if (ret && inode->i_nlink) {
@@@ -4904,8 -4908,8 +4908,8 @@@ static void evict_inode_truncate_pages(
  
        /*
         * Keep looping until we have no more ranges in the io tree.
 -       * We can have ongoing bios started by readpages (called from readahead)
 -       * that have their endio callback (extent_io.c:end_bio_extent_readpage)
 +       * We can have ongoing bios started by readahead that have
 +       * their endio callback (extent_io.c:end_bio_extent_readpage)
         * still in progress (unlocked the pages in the bio but did not yet
         * unlock the ranges in the io tree). Therefore this means some
         * ranges can still be locked and eviction started because before
@@@ -7041,7 -7045,7 +7045,7 @@@ out
  }
  
  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
-                             struct extent_state **cached_state, bool writing)
+                             struct extent_state **cached_state, int writing)
  {
        struct btrfs_ordered_extent *ordered;
        int ret = 0;
                         * for it to complete) and then invalidate the pages for
                         * this range (through invalidate_inode_pages2_range()),
                         * but that can lead us to a deadlock with a concurrent
 -                       * call to readpages() (a buffered read or a defrag call
 +                       * call to readahead (a buffered read or a defrag call
                         * triggered a readahead) on a page lock due to an
                         * ordered dio extent we created before but did not have
                         * yet a corresponding bio submitted (whence it can not
 -                       * complete), which makes readpages() wait for that
 +                       * complete), which makes readahead wait for that
                         * ordered extent to complete while holding a lock on
                         * that page.
                         */
@@@ -7179,7 -7183,30 +7183,30 @@@ static struct extent_map *create_io_em(
  }
  
  
+ static int btrfs_get_blocks_direct_read(struct extent_map *em,
+                                       struct buffer_head *bh_result,
+                                       struct inode *inode,
+                                       u64 start, u64 len)
+ {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ 
+       if (em->block_start == EXTENT_MAP_HOLE ||
+                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               return -ENOENT;
+ 
+       len = min(len, em->len - (start - em->start));
+ 
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+       set_buffer_mapped(bh_result);
+ 
+       return 0;
+ }
+ 
  static int btrfs_get_blocks_direct_write(struct extent_map **map,
+                                        struct buffer_head *bh_result,
                                         struct inode *inode,
                                         struct btrfs_dio_data *dio_data,
                                         u64 start, u64 len)
        }
  
        /* this will cow the extent */
+       len = bh_result->b_size;
        free_extent_map(em);
        *map = em = btrfs_new_extent_direct(inode, start, len);
        if (IS_ERR(em)) {
        len = min(len, em->len - (start - em->start));
  
  skip_cow:
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+       set_buffer_mapped(bh_result);
+       if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               set_buffer_new(bh_result);
        /*
         * Need to update the i_size under the extent lock so buffered
         * readers will get the updated i_size when we unlock.
         */
-       if (start + len > i_size_read(inode))
+       if (!dio_data->overwrite && start + len > i_size_read(inode))
                i_size_write(inode, start + len);
  
+       WARN_ON(dio_data->reserve < len);
        dio_data->reserve -= len;
+       dio_data->unsubmitted_oe_range_end = start + len;
+       current->journal_info = dio_data;
  out:
        return ret;
  }
  
- static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
-               loff_t length, unsigned flags, struct iomap *iomap,
-               struct iomap *srcmap)
+ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_map *em;
        struct extent_state *cached_state = NULL;
        struct btrfs_dio_data *dio_data = NULL;
+       u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
-       const bool write = !!(flags & IOMAP_WRITE);
+       u64 len = bh_result->b_size;
        int ret = 0;
-       u64 len = length;
-       bool unlock_extents = false;
  
-       if (!write)
+       if (!create)
                len = min_t(u64, len, fs_info->sectorsize);
  
        lockstart = start;
        lockend = start + len - 1;
  
-       /*
-        * The generic stuff only does filemap_write_and_wait_range, which
-        * isn't enough if we've written compressed pages to this area, so we
-        * need to flush the dirty pages again to make absolutely sure that any
-        * outstanding dirty pages are on disk.
-        */
-       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                    &BTRFS_I(inode)->runtime_flags))
-               ret = filemap_fdatawrite_range(inode->i_mapping, start,
-                                              start + length - 1);
-       dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
-       if (!dio_data)
-               return -ENOMEM;
-       dio_data->length = length;
-       if (write) {
-               dio_data->reserve = round_up(length, fs_info->sectorsize);
-               ret = btrfs_delalloc_reserve_space(inode,
-                               &dio_data->data_reserved,
-                               start, dio_data->reserve);
-               if (ret) {
-                       extent_changeset_free(dio_data->data_reserved);
-                       kfree(dio_data);
-                       return ret;
-               }
+       if (current->journal_info) {
+               /*
+                * Need to pull our outstanding extents and set journal_info to NULL so
+                * that anything that needs to check if there's a transaction doesn't get
+                * confused.
+                */
+               dio_data = current->journal_info;
+               current->journal_info = NULL;
        }
-       iomap->private = dio_data;
  
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
         */
-       if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
+       if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+                              create)) {
                ret = -ENOTBLK;
                goto err;
        }
                goto unlock_err;
        }
  
-       len = min(len, em->len - (start - em->start));
-       if (write) {
-               ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
-                                                   start, len);
+       if (create) {
+               ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+                                                   dio_data, start, len);
                if (ret < 0)
                        goto unlock_err;
-               unlock_extents = true;
-               /* Recalc len in case the new em is smaller than requested */
-               len = min(len, em->len - (start - em->start));
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, &cached_state);
        } else {
+               ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+                                                  start, len);
+               /* Can be negative only if we read from a hole */
+               if (ret < 0) {
+                       ret = 0;
+                       free_extent_map(em);
+                       goto unlock_err;
+               }
                /*
                 * We need to unlock only the end area that we aren't using.
                 * The rest is going to be unlocked by the endio routine.
                 */
-               lockstart = start + len;
-               if (lockstart < lockend)
-                       unlock_extents = true;
-       }
-       if (unlock_extents)
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                    lockstart, lockend, &cached_state);
-       else
-               free_extent_state(cached_state);
-       /*
-        * Translate extent map information to iomap.
-        * We trim the extents (and move the addr) even though iomap code does
-        * that, since we have locked only the parts we are performing I/O in.
-        */
-       if ((em->block_start == EXTENT_MAP_HOLE) ||
-           (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
-               iomap->addr = IOMAP_NULL_ADDR;
-               iomap->type = IOMAP_HOLE;
-       } else {
-               iomap->addr = em->block_start + (start - em->start);
-               iomap->type = IOMAP_MAPPED;
+               lockstart = start + bh_result->b_size;
+               if (lockstart < lockend) {
+                       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                            lockstart, lockend, &cached_state);
+               } else {
+                       free_extent_state(cached_state);
+               }
        }
-       iomap->offset = start;
-       iomap->bdev = fs_info->fs_devices->latest_bdev;
-       iomap->length = len;
  
        free_extent_map(em);
  
@@@ -7399,53 -7406,8 +7406,8 @@@ unlock_err
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state);
  err:
-       if (dio_data) {
-               btrfs_delalloc_release_space(inode, dio_data->data_reserved,
-                               start, dio_data->reserve, true);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
-               extent_changeset_free(dio_data->data_reserved);
-               kfree(dio_data);
-       }
-       return ret;
- }
- static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-               ssize_t written, unsigned flags, struct iomap *iomap)
- {
-       int ret = 0;
-       struct btrfs_dio_data *dio_data = iomap->private;
-       size_t submitted = dio_data->submitted;
-       const bool write = !!(flags & IOMAP_WRITE);
-       if (!write && (iomap->type == IOMAP_HOLE)) {
-               /* If reading from a hole, unlock and return */
-               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
-               goto out;
-       }
-       if (submitted < length) {
-               pos += submitted;
-               length -= submitted;
-               if (write)
-                       __endio_write_update_ordered(inode, pos, length, false);
-               else
-                       unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-                                     pos + length - 1);
-               ret = -ENOTBLK;
-       }
-       if (write) {
-               if (dio_data->reserve)
-                       btrfs_delalloc_release_space(inode,
-                                       dio_data->data_reserved, pos,
-                                       dio_data->reserve, true);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
-               extent_changeset_free(dio_data->data_reserved);
-       }
- out:
-       kfree(dio_data);
-       iomap->private = NULL;
+       if (dio_data)
+               current->journal_info = dio_data;
        return ret;
  }
  
@@@ -7468,7 -7430,7 +7430,7 @@@ static void btrfs_dio_private_put(struc
                              dip->logical_offset + dip->bytes - 1);
        }
  
-       bio_endio(dip->dio_bio);
+       dio_end_io(dip->dio_bio);
        kfree(dip);
  }
  
@@@ -7704,11 -7666,24 +7666,24 @@@ static struct btrfs_dio_private *btrfs_
        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        dip->dio_bio = dio_bio;
        refcount_set(&dip->refs, 1);
+       if (write) {
+               struct btrfs_dio_data *dio_data = current->journal_info;
+ 
+               /*
+                * Setting range start and end to the same value means that
+                * no cleanup will happen in btrfs_direct_IO
+                */
+               dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+                       dip->bytes;
+               dio_data->unsubmitted_oe_range_start =
+                       dio_data->unsubmitted_oe_range_end;
+       }
+ 
        return dip;
  }
  
- static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
-               struct bio *dio_bio, loff_t file_offset)
+ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+                               loff_t file_offset)
  {
        const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
        const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
        int ret;
        blk_status_t status;
        struct btrfs_io_geometry geom;
-       struct btrfs_dio_data *dio_data = iomap->private;
  
        dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
        if (!dip) {
                                file_offset + dio_bio->bi_iter.bi_size - 1);
                }
                dio_bio->bi_status = BLK_STS_RESOURCE;
-               bio_endio(dio_bio);
-               return BLK_QC_T_NONE;
+               dio_end_io(dio_bio);
+               return;
        }
  
        if (!write && csum) {
                        goto out_err;
                }
  
-               dio_data->submitted += clone_len;
                clone_offset += clone_len;
                start_sector += clone_len >> 9;
                file_offset += clone_len;
        } while (submit_len > 0);
-       return BLK_QC_T_NONE;
+       return;
  
  out_err:
        dip->dio_bio->bi_status = status;
        btrfs_dio_private_put(dip);
-       return BLK_QC_T_NONE;
  }
  
- const struct iomap_ops btrfs_dio_iomap_ops = {
-       .iomap_begin            = btrfs_dio_iomap_begin,
-       .iomap_end              = btrfs_dio_iomap_end,
- };
+ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+                              const struct iov_iter *iter, loff_t offset)
+ {
+       int seg;
+       int i;
+       unsigned int blocksize_mask = fs_info->sectorsize - 1;
+       ssize_t retval = -EINVAL;
  
- const struct iomap_dio_ops btrfs_dops = {
-       .submit_io              = btrfs_submit_direct,
- };
+       if (offset & blocksize_mask)
+               goto out;
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               goto out;
+       /* If this is a write we don't need to check anymore */
+       if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
+               return 0;
+       /*
+        * Check to make sure we don't have duplicate iov_base's in this
+        * iovec; if we do, return EINVAL, otherwise we'll get csum errors
+        * when reading back.
+        */
+       for (seg = 0; seg < iter->nr_segs; seg++) {
+               for (i = seg + 1; i < iter->nr_segs; i++) {
+                       if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+                               goto out;
+               }
+       }
+       retval = 0;
+ out:
+       return retval;
+ }
+ 
+ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+ {
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_dio_data dio_data = { 0 };
+       struct extent_changeset *data_reserved = NULL;
+       loff_t offset = iocb->ki_pos;
+       size_t count = 0;
+       int flags = 0;
+       bool wakeup = true;
+       bool relock = false;
+       ssize_t ret;
+ 
+       if (check_direct_IO(fs_info, iter, offset))
+               return 0;
+ 
+       inode_dio_begin(inode);
+ 
+       /*
+        * The generic stuff only does filemap_write_and_wait_range, which
+        * isn't enough if we've written compressed pages to this area, so
+        * we need to flush the dirty pages again to make absolutely sure
+        * that any outstanding dirty pages are on disk.
+        */
+       count = iov_iter_count(iter);
+       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                    &BTRFS_I(inode)->runtime_flags))
+               filemap_fdatawrite_range(inode->i_mapping, offset,
+                                        offset + count - 1);
+       if (iov_iter_rw(iter) == WRITE) {
+               /*
+                * If the write DIO is beyond the EOF, we need to update
+                * the isize, but it is protected by i_mutex. So we cannot
+                * unlock the i_mutex in this case.
+                */
+               if (offset + count <= inode->i_size) {
+                       dio_data.overwrite = 1;
+                       inode_unlock(inode);
+                       relock = true;
+               } else if (iocb->ki_flags & IOCB_NOWAIT) {
+                       ret = -EAGAIN;
+                       goto out;
+               }
+               ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+                                                  offset, count);
+               if (ret)
+                       goto out;
+               /*
+                * We need to know how many extents we reserved so that we can
+                * do the accounting properly if we go over the number we
+                * originally calculated.  Abuse current->journal_info for this.
+                */
+               dio_data.reserve = round_up(count,
+                                           fs_info->sectorsize);
+               dio_data.unsubmitted_oe_range_start = (u64)offset;
+               dio_data.unsubmitted_oe_range_end = (u64)offset;
+               current->journal_info = &dio_data;
+               down_read(&BTRFS_I(inode)->dio_sem);
+       } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                                    &BTRFS_I(inode)->runtime_flags)) {
+               inode_dio_end(inode);
+               flags = DIO_LOCKING | DIO_SKIP_HOLES;
+               wakeup = false;
+       }
+       ret = __blockdev_direct_IO(iocb, inode,
+                                  fs_info->fs_devices->latest_bdev,
+                                  iter, btrfs_get_blocks_direct, NULL,
+                                  btrfs_submit_direct, flags);
+       if (iov_iter_rw(iter) == WRITE) {
+               up_read(&BTRFS_I(inode)->dio_sem);
+               current->journal_info = NULL;
+               if (ret < 0 && ret != -EIOCBQUEUED) {
+                       if (dio_data.reserve)
+                               btrfs_delalloc_release_space(inode, data_reserved,
+                                       offset, dio_data.reserve, true);
+                       /*
+                        * On error we might have left some ordered extents
+                        * without submitting corresponding bios for them, so
+                        * clean them up to avoid other tasks getting them
+                        * and waiting for them to complete forever.
+                        */
+                       if (dio_data.unsubmitted_oe_range_start <
+                           dio_data.unsubmitted_oe_range_end)
+                               __endio_write_update_ordered(inode,
+                                       dio_data.unsubmitted_oe_range_start,
+                                       dio_data.unsubmitted_oe_range_end -
+                                       dio_data.unsubmitted_oe_range_start,
+                                       false);
+               } else if (ret >= 0 && (size_t)ret < count)
+                       btrfs_delalloc_release_space(inode, data_reserved,
+                                       offset, count - (size_t)ret, true);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+       }
+ out:
+       if (wakeup)
+               inode_dio_end(inode);
+       if (relock)
+               inode_lock(inode);
+       extent_changeset_free(data_reserved);
+       return ret;
+ }
  
 -#define BTRFS_FIEMAP_FLAGS    (FIEMAP_FLAG_SYNC)
 -
  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
  {
        int     ret;
  
 -      ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
 +      ret = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (ret)
                return ret;
  
@@@ -7876,16 -7981,21 +7979,16 @@@ static int btrfs_writepages(struct addr
        return extent_writepages(mapping, wbc);
  }
  
 -static int
 -btrfs_readpages(struct file *file, struct address_space *mapping,
 -              struct list_head *pages, unsigned nr_pages)
 +static void btrfs_readahead(struct readahead_control *rac)
  {
 -      return extent_readpages(mapping, pages, nr_pages);
 +      extent_readahead(rac);
  }
  
  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
  {
        int ret = try_release_extent_mapping(page, gfp_flags);
 -      if (ret == 1) {
 -              ClearPagePrivate(page);
 -              set_page_private(page, 0);
 -              put_page(page);
 -      }
 +      if (ret == 1)
 +              detach_page_private(page);
        return ret;
  }
  
@@@ -7907,8 -8017,14 +8010,8 @@@ static int btrfs_migratepage(struct add
        if (ret != MIGRATEPAGE_SUCCESS)
                return ret;
  
 -      if (page_has_private(page)) {
 -              ClearPagePrivate(page);
 -              get_page(newpage);
 -              set_page_private(newpage, page_private(page));
 -              set_page_private(page, 0);
 -              put_page(page);
 -              SetPagePrivate(newpage);
 -      }
 +      if (page_has_private(page))
 +              attach_page_private(newpage, detach_page_private(page));
  
        if (PagePrivate2(page)) {
                ClearPagePrivate2(page);
@@@ -8030,7 -8146,11 +8133,7 @@@ again
        }
  
        ClearPageChecked(page);
 -      if (PagePrivate(page)) {
 -              ClearPagePrivate(page);
 -              set_page_private(page, 0);
 -              put_page(page);
 -      }
 +      detach_page_private(page);
  }
  
  /*
@@@ -10121,8 -10241,8 +10224,8 @@@ static const struct address_space_opera
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
 -      .readpages      = btrfs_readpages,
 +      .readahead      = btrfs_readahead,
-       .direct_IO      = noop_direct_IO,
+       .direct_IO      = btrfs_direct_IO,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
  #ifdef CONFIG_MIGRATION
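
One pattern worth noting in the restored write path above: get_block_t has no
private-data argument, so btrfs_direct_IO() smuggles its per-call
btrfs_dio_data through current->journal_info, where btrfs_get_blocks_direct()
picks it up. A minimal sketch of the handoff, condensed from the hunks above
(error handling elided):

	/* Writer side, in btrfs_direct_IO(): stash the per-call state. */
	struct btrfs_dio_data dio_data = { 0 };
	current->journal_info = &dio_data;

	/*
	 * Callback side, in btrfs_get_blocks_direct(): take it back and
	 * clear journal_info so transaction checks elsewhere don't mistake
	 * it for a running transaction.
	 */
	struct btrfs_dio_data *dio_data = NULL;
	if (current->journal_info) {
		dio_data = current->journal_info;
		current->journal_info = NULL;
	}
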
diff --combined fs/direct-io.c
@@@ -386,6 -386,25 +386,25 @@@ static void dio_bio_end_io(struct bio *
        spin_unlock_irqrestore(&dio->bio_lock, flags);
  }
  
+ /**
+  * dio_end_io - handle the end io action for the given bio
+  * @bio: The direct io bio that's being completed
+  *
+  * This is meant to be called by any filesystem that uses its own dio_submit_t
+  * so that the DIO specific endio actions are dealt with after the filesystem
+  * has done its completion work.
+  */
+ void dio_end_io(struct bio *bio)
+ {
+       struct dio *dio = bio->bi_private;
+ 
+       if (dio->is_async)
+               dio_bio_end_aio(bio);
+       else
+               dio_bio_end_io(bio);
+ }
+ EXPORT_SYMBOL_GPL(dio_end_io);
+ 
  static inline void
  dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
              struct block_device *bdev,
@@@ -481,7 -500,7 +500,7 @@@ static struct bio *dio_await_one(struc
                spin_unlock_irqrestore(&dio->bio_lock, flags);
                if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
 -                      io_schedule();
 +                      blk_io_schedule();
                /* wake up sets us TASK_RUNNING */
                spin_lock_irqsave(&dio->bio_lock, flags);
                dio->waiter = NULL;
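
The dio_end_io() helper re-exported above exists so a filesystem that supplies
its own dio_submit_t (as btrfs does with btrfs_submit_direct()) can finish its
private completion work before handing the bio back to the generic direct-io
layer; btrfs_dio_private_put() in the inode.c diff is the in-tree caller. A
hypothetical minimal caller for illustration only (the example_* names are not
from this patch):

	static void example_dio_private_put(struct example_dio_private *dip)
	{
		/*
		 * Filesystem-specific completion work (csum verification,
		 * extent unlocking, ...) would go here.
		 */
		dio_end_io(dip->dio_bio);  /* run the generic sync/async endio path */
		kfree(dip);
	}
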
diff --combined include/linux/fs.h
@@@ -24,6 -24,7 +24,6 @@@
  #include <linux/capability.h>
  #include <linux/semaphore.h>
  #include <linux/fcntl.h>
 -#include <linux/fiemap.h>
  #include <linux/rculist_bl.h>
  #include <linux/atomic.h>
  #include <linux/shrinker.h>
@@@ -47,7 -48,6 +47,7 @@@ struct backing_dev_info
  struct bdi_writeback;
  struct bio;
  struct export_operations;
 +struct fiemap_extent_info;
  struct hd_geometry;
  struct iovec;
  struct kiocb;
@@@ -292,7 -292,6 +292,7 @@@ enum positive_aop_returns 
  struct page;
  struct address_space;
  struct writeback_control;
 +struct readahead_control;
  
  /*
   * Write life time hint values.
@@@ -376,7 -375,6 +376,7 @@@ struct address_space_operations 
         */
        int (*readpages)(struct file *filp, struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages);
 +      void (*readahead)(struct readahead_control *);
  
        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
@@@ -978,7 -976,6 +978,7 @@@ struct file 
  #endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
        errseq_t                f_wb_err;
 +      errseq_t                f_sb_err; /* for syncfs */
  } __randomize_layout
    __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */
  
@@@ -1048,7 -1045,6 +1048,7 @@@ struct lock_manager_operations 
        bool (*lm_break)(struct file_lock *);
        int (*lm_change)(struct file_lock *, int, struct list_head *);
        void (*lm_setup)(struct file_lock *, void **);
 +      bool (*lm_breaker_owns_lease)(struct file_lock *);
  };
  
  struct lock_manager {
@@@ -1413,8 -1409,6 +1413,8 @@@ extern int send_sigurg(struct fown_stru
  #define SB_I_IMA_UNVERIFIABLE_SIGNATURE       0x00000020
  #define SB_I_UNTRUSTED_MOUNTER                0x00000040
  
 +#define SB_I_SKIP_SYNC        0x00000100      /* Skip superblock at global sync */
 +
  /* Possible states of 'frozen' field */
  enum {
        SB_UNFROZEN = 0,                /* FS is unfrozen */
@@@ -1526,9 -1520,6 +1526,9 @@@ struct super_block 
        /* Being remounted read-only */
        int s_readonly_remount;
  
 +      /* per-sb errseq_t for reporting writeback errors via syncfs */
 +      errseq_t s_wb_err;
 +
        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;
@@@ -1682,10 -1673,10 +1682,10 @@@ static inline int sb_start_write_tryloc
   *
   * Since page fault freeze protection behaves as a lock, users have to preserve
   * ordering of freeze protection and other filesystem locks. It is advised to
 - * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
 + * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
   * handling code implies lock dependency:
   *
 - * mmap_sem
 + * mmap_lock
   *   -> sb_start_pagefault
   */
  static inline void sb_start_pagefault(struct super_block *sb)
@@@ -1730,11 -1721,7 +1730,11 @@@ extern int vfs_link(struct dentry *, st
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
  extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 -extern int vfs_whiteout(struct inode *, struct dentry *);
 +
 +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
 +{
 +      return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
 +}
  
  extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
                                  int open_flag);
@@@ -1758,6 -1745,19 +1758,6 @@@ extern long compat_ptr_ioctl(struct fil
  extern void inode_init_owner(struct inode *inode, const struct inode *dir,
                        umode_t mode);
  extern bool may_open_dev(const struct path *path);
 -/*
 - * VFS FS_IOC_FIEMAP helper definitions.
 - */
 -struct fiemap_extent_info {
 -      unsigned int fi_flags;          /* Flags as passed from user */
 -      unsigned int fi_extents_mapped; /* Number of mapped extents */
 -      unsigned int fi_extents_max;    /* Size of fiemap_extent array */
 -      struct fiemap_extent __user *fi_extents_start; /* Start of
 -                                                      fiemap_extent array */
 -};
 -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
 -                          u64 phys, u64 len, u32 flags);
 -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
  
  /*
   * This is the "filldir" function type, used by readdir() to let
@@@ -2156,8 -2156,6 +2156,8 @@@ static inline void kiocb_clone(struct k
   *
   * I_CREATING         New object's inode in the middle of setting up.
   *
 + * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 + *
   * Q: What is the difference between I_WILL_FREE and I_FREEING?
   */
  #define I_DIRTY_SYNC          (1 << 0)
  #define I_WB_SWITCH           (1 << 13)
  #define I_OVL_INUSE           (1 << 14)
  #define I_CREATING            (1 << 15)
 +#define I_DONTCACHE           (1 << 16)
  
  #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
  #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
@@@ -2584,6 -2581,7 +2584,6 @@@ extern struct kmem_cache *names_cachep
  #ifdef CONFIG_BLOCK
  extern int register_blkdev(unsigned int, const char *);
  extern void unregister_blkdev(unsigned int, const char *);
 -extern void bdev_unhash_inode(dev_t dev);
  extern struct block_device *bdget(dev_t);
  extern struct block_device *bdgrab(struct block_device *bdev);
  extern void bd_set_size(struct block_device *, loff_t size);
@@@ -2639,6 -2637,7 +2639,6 @@@ extern int sync_filesystem(struct super
  extern const struct file_operations def_blk_fops;
  extern const struct file_operations def_chr_fops;
  #ifdef CONFIG_BLOCK
 -extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
  extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
  extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
  extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
@@@ -2724,6 -2723,7 +2724,6 @@@ extern bool is_bad_inode(struct inode *
  extern int revalidate_disk(struct gendisk *);
  extern int check_disk_change(struct block_device *);
  extern int __invalidate_device(struct block_device *, bool);
 -extern int invalidate_partition(struct gendisk *, int);
  #endif
  unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);
@@@ -2827,18 -2827,6 +2827,18 @@@ static inline errseq_t filemap_sample_w
        return errseq_sample(&mapping->wb_err);
  }
  
 +/**
 + * file_sample_sb_err - sample the current errseq_t to test for later errors
 + * @file: file pointer to be sampled
 + *
 + * Grab the most current superblock-level errseq_t value for the given
 + * struct file.
 + */
 +static inline errseq_t file_sample_sb_err(struct file *file)
 +{
 +      return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
 +}
 +
  static inline int filemap_nr_thps(struct address_space *mapping)
  {
  #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@@ -3061,10 -3049,8 +3061,10 @@@ extern int inode_needs_sync(struct inod
  extern int generic_delete_inode(struct inode *inode);
  static inline int generic_drop_inode(struct inode *inode)
  {
 -      return !inode->i_nlink || inode_unhashed(inode);
 +      return !inode->i_nlink || inode_unhashed(inode) ||
 +              (inode->i_state & I_DONTCACHE);
  }
 +extern void d_mark_dontcache(struct inode *inode);
  
  extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
@@@ -3084,9 -3070,6 +3084,9 @@@ extern struct inode *find_inode_nowait(
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
 +extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
 +                                  int (*)(struct inode *, void *), void *);
 +extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
  extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
  extern int insert_inode_locked(struct inode *);
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@@ -3204,6 -3187,8 +3204,8 @@@ enum 
        DIO_SKIP_HOLES  = 0x02,
  };
  
+ void dio_end_io(struct bio *bio);
+ 
  ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
@@@ -3316,6 -3301,14 +3318,6 @@@ static inline int vfs_fstat(int fd, str
  extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
  extern int vfs_readlink(struct dentry *, char __user *, int);
  
 -extern int __generic_block_fiemap(struct inode *inode,
 -                                struct fiemap_extent_info *fieinfo,
 -                                loff_t start, loff_t len,
 -                                get_block_t *get_block);
 -extern int generic_block_fiemap(struct inode *inode,
 -                              struct fiemap_extent_info *fieinfo, u64 start,
 -                              u64 len, get_block_t *get_block);
 -
  extern struct file_system_type *get_filesystem(struct file_system_type *fs);
  extern void put_filesystem(struct file_system_type *fs);
  extern struct file_system_type *get_fs_type(const char *name);
@@@ -3403,6 -3396,11 +3405,6 @@@ extern void setattr_copy(struct inode *
  
  extern int file_update_time(struct file *file);
  
 -static inline bool io_is_direct(struct file *filp)
 -{
 -      return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
 -}
 -
  static inline bool vma_is_dax(const struct vm_area_struct *vma)
  {
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
@@@ -3427,7 -3425,7 +3429,7 @@@ static inline int iocb_flags(struct fil
        int res = 0;
        if (file->f_flags & O_APPEND)
                res |= IOCB_APPEND;
 -      if (io_is_direct(file))
 +      if (file->f_flags & O_DIRECT)
                res |= IOCB_DIRECT;
        if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
                res |= IOCB_DSYNC;
@@@ -3540,11 -3538,11 +3542,11 @@@ ssize_t simple_attr_write(struct file *
  
  struct ctl_table;
  int proc_nr_files(struct ctl_table *table, int write,
 -                void __user *buffer, size_t *lenp, loff_t *ppos);
 +                void *buffer, size_t *lenp, loff_t *ppos);
  int proc_nr_dentry(struct ctl_table *table, int write,
 -                void __user *buffer, size_t *lenp, loff_t *ppos);
 +                void *buffer, size_t *lenp, loff_t *ppos);
  int proc_nr_inodes(struct ctl_table *table, int write,
 -                 void __user *buffer, size_t *lenp, loff_t *ppos);
 +                 void *buffer, size_t *lenp, loff_t *ppos);
  int __init get_filesystem_list(char *buf);
  
  #define __FMODE_EXEC          ((__force int) FMODE_EXEC)