Merge tag 'fuse-update-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/mszered...

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)
diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst

index cd717f9..8120c3c 100644 (file)
--- a/Documentation/filesystems/fuse.rst
+++ b/Documentation/filesystems/fuse.rst
@@ -47,7 +47,7 @@ filesystems.  A good example is sshfs: a secure network filesystem
  using the sftp protocol.
  
  The userspace library and utilities are available from the
-`FUSE homepage: <http://fuse.sourceforge.net/>`_
+`FUSE homepage: <https://github.com/libfuse/>`_
  
  Filesystem type
  ===============
diff --git a/MAINTAINERS b/MAINTAINERS

index 5919b75..084cb79 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7238,7 +7238,7 @@ FUSE: FILESYSTEM IN USERSPACE
  M:     Miklos Szeredi <miklos@szeredi.hu>
  L:     linux-fsdevel@vger.kernel.org
  S:     Maintained
-W:     http://fuse.sourceforge.net/
+W:     https://github.com/libfuse/
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git
  F:     Documentation/filesystems/fuse.rst
  F:     fs/fuse/
diff --git a/drivers/dax/super.c b/drivers/dax/super.c

index e84070b..edc279b 100644 (file)
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -46,7 +46,8 @@ EXPORT_SYMBOL_GPL(dax_read_unlock);
  int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                 pgoff_t *pgoff)
  {
-       phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
+       sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
+       phys_addr_t phys_off = (start_sect + sector) * 512;
  
         if (pgoff)
                 *pgoff = PHYS_PFN(phys_off);
diff --git a/fs/dax.c b/fs/dax.c

index 6ad3463..5b47834 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -559,8 +559,11 @@ fallback:
  }
  
  /**
- * dax_layout_busy_page - find first pinned page in @mapping
+ * dax_layout_busy_page_range - find first pinned page in @mapping
   * @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ *       pages from 'start' till the end of file are included.
   *
   * DAX requires ZONE_DEVICE mapped pages. These pages are never
   * 'onlined' to the page allocator so they are considered idle when
@@ -573,12 +576,15 @@ fallback:
   * to be able to run unmap_mapping_range() and subsequently not race
   * mapping_mapped() becoming true.
   */
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
+                                       loff_t start, loff_t end)
  {
-       XA_STATE(xas, &mapping->i_pages, 0);
         void *entry;
         unsigned int scanned = 0;
         struct page *page = NULL;
+       pgoff_t start_idx = start >> PAGE_SHIFT;
+       pgoff_t end_idx;
+       XA_STATE(xas, &mapping->i_pages, start_idx);
  
         /*
          * In the 'limited' case get_user_pages() for dax is disabled.
@@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
         if (!dax_mapping(mapping) || !mapping_mapped(mapping))
                 return NULL;
  
+       /* If end == LLONG_MAX, include all pages from start till end of file */
+       if (end == LLONG_MAX)
+               end_idx = ULONG_MAX;
+       else
+               end_idx = end >> PAGE_SHIFT;
         /*
          * If we race get_user_pages_fast() here either we'll see the
          * elevated page count in the iteration and wait, or
@@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
          * against is no longer mapped in the page tables and bail to the
          * get_user_pages() slow path.  The slow path is protected by
          * pte_lock() and pmd_lock(). New references are not taken without
-        * holding those locks, and unmap_mapping_range() will not zero the
+        * holding those locks, and unmap_mapping_pages() will not zero the
          * pte or pmd without holding the respective lock, so we are
          * guaranteed to either see new references or prevent new
          * references from being established.
          */
-       unmap_mapping_range(mapping, 0, 0, 0);
+       unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
  
         xas_lock_irq(&xas);
-       xas_for_each(&xas, entry, ULONG_MAX) {
+       xas_for_each(&xas, entry, end_idx) {
                 if (WARN_ON_ONCE(!xa_is_value(entry)))
                         continue;
                 if (unlikely(dax_is_locked(entry)))
@@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
         xas_unlock_irq(&xas);
         return page;
  }
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+
+/*
+ * Whole-file variant of dax_layout_busy_page_range(): scans from offset 0
+ * and passes LLONG_MAX as the end, which the range variant treats as
+ * "till the end of file".
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+       const loff_t scan_from = 0;
+       const loff_t scan_to = LLONG_MAX;
+
+       return dax_layout_busy_page_range(mapping, scan_from, scan_to);
+}
  EXPORT_SYMBOL_GPL(dax_layout_busy_page);
  
  static int __dax_invalidate_entry(struct address_space *mapping,
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig

index 774b261..40ce9a1 100644 (file)
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -8,7 +8,7 @@ config FUSE_FS
  
           There's also a companion library: libfuse2.  This library is available
           from the FUSE homepage:
-         <http://fuse.sourceforge.net/>
+         <https://github.com/libfuse/>
           although chances are your distribution already has that library
           installed if you've installed the "fuse" package itself.
  
@@ -38,3 +38,17 @@ config VIRTIO_FS
  
           If you want to share files between guests or with the host, answer Y
           or M.
+
+config FUSE_DAX
+       bool "Virtio Filesystem Direct Host Memory Access support"
+       default y
+       select INTERVAL_TREE
+       depends on VIRTIO_FS
+       depends on FS_DAX
+       depends on DAX_DRIVER
+       help
+         This allows bypassing guest page cache and allows mapping host page
+         cache directly in guest address space.
+
+         If you want to allow mounting a Virtio Filesystem with the "dax"
+         option, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile

index 3e8cebf..8c7021f 100644 (file)
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
  obj-$(CONFIG_CUSE) += cuse.o
  obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
  
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
-virtiofs-y += virtio_fs.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c

index a1303ad..cc7e94d 100644 (file)
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
  {
         unsigned val;
         struct fuse_conn *fc;
+       struct fuse_mount *fm;
         ssize_t ret;
  
         ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
         if (!fc)
                 goto out;
  
+       down_read(&fc->killsb);
         spin_lock(&fc->bg_lock);
         fc->congestion_threshold = val;
-       if (fc->sb) {
+
+       /*
+        * Get any fuse_mount belonging to this fuse_conn; s_bdi is
+        * shared between all of them
+        */
+
+       if (!list_empty(&fc->mounts)) {
+               fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
                 if (fc->num_background < fc->congestion_threshold) {
-                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
-                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+                       clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+                       clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
                 } else {
-                       set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
-                       set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+                       set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+                       set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
                 }
         }
         spin_unlock(&fc->bg_lock);
+       up_read(&fc->killsb);
         fuse_conn_put(fc);
  out:
         return ret;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c

index 2cc1781..4508226 100644 (file)
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -57,6 +57,7 @@
  
  struct cuse_conn {
         struct list_head        list;   /* linked on cuse_conntbl */
+       struct fuse_mount       fm;     /* Dummy mount referencing fc */
         struct fuse_conn        fc;     /* fuse connection */
         struct cdev             *cdev;  /* associated character device */
         struct device           *dev;   /* device representing @cdev */
@@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file)
          * Generic permission check is already done against the chrdev
          * file, proceed to open.
          */
-       rc = fuse_do_open(&cc->fc, 0, file, 0);
+       rc = fuse_do_open(&cc->fm, 0, file, 0);
         if (rc)
                 fuse_conn_put(&cc->fc);
         return rc;
@@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file)
  static int cuse_release(struct inode *inode, struct file *file)
  {
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
  
         fuse_sync_release(NULL, ff, file->f_flags);
-       fuse_conn_put(fc);
+       fuse_conn_put(fm->fc);
  
         return 0;
  }
@@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
  {
         struct fuse_file *ff = file->private_data;
-       struct cuse_conn *cc = fc_to_cc(ff->fc);
+       struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
         unsigned int flags = 0;
  
         if (cc->unrestricted_ioctl)
@@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
                                    unsigned long arg)
  {
         struct fuse_file *ff = file->private_data;
-       struct cuse_conn *cc = fc_to_cc(ff->fc);
+       struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
         unsigned int flags = FUSE_IOCTL_COMPAT;
  
         if (cc->unrestricted_ioctl)
@@ -313,9 +314,10 @@ struct cuse_init_args {
   * required data structures for it.  Please read the comment at the
   * top of this file for high level overview.
   */
-static void cuse_process_init_reply(struct fuse_conn *fc,
+static void cuse_process_init_reply(struct fuse_mount *fm,
                                     struct fuse_args *args, int error)
  {
+       struct fuse_conn *fc = fm->fc;
         struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
         struct fuse_args_pages *ap = &ia->ap;
         struct cuse_conn *cc = fc_to_cc(fc), *pos;
@@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc)
  {
         int rc;
         struct page *page;
-       struct fuse_conn *fc = &cc->fc;
+       struct fuse_mount *fm = &cc->fm;
         struct cuse_init_args *ia;
         struct fuse_args_pages *ap;
  
@@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc)
         ia->desc.length = ap->args.out_args[1].size;
         ap->args.end = cuse_process_init_reply;
  
-       rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+       rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
         if (rc) {
                 kfree(ia);
  err_free_page:
@@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
          * Limit the cuse channel to requests that can
          * be represented in file->f_cred->user_ns.
          */
-       fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
+       fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+                      &fuse_dev_fiq_ops, NULL);
  
         fud = fuse_dev_alloc_install(&cc->fc);
         if (!fud) {
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c

new file mode 100644 (file)

index 0000000..ff99ab2
--- /dev/null
+++ b/fs/fuse/dax.c
@@ -0,0 +1,1365 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size.  A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ *
+ * FUSE_DAX_SZ is the size of one range (1 << 21, i.e. 2MB) and
+ * FUSE_DAX_PAGES is the number of pages in one range.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ    (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK         (10)
+
+/*
+ * Dax memory reclaim threshold, as a percentage of the total number of
+ * ranges. When the number of free ranges drops below this threshold,
+ * reclaim can be triggered. Default is 20%.
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD     (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+       /* Pointer to inode where this memory range is mapped */
+       struct inode *inode;
+
+       /*
+        * Will connect in fcd->free_ranges to keep track of free memory.
+        * Also used to collect entries on a local list while they are
+        * being reclaimed.
+        */
+       struct list_head list;
+
+       /*
+        * For interval tree in file/inode. start and last are both set to
+        * the range index (file offset >> FUSE_DAX_SHIFT).
+        */
+       struct interval_tree_node itn;
+
+       /* Will connect in fcd->busy_ranges to keep track of busy memory */
+       struct list_head busy_list;
+
+       /** Position in DAX window */
+       u64 window_offset;
+
+       /** Length of mapping, in bytes */
+       loff_t length;
+
+       /* Is this mapping read-only or read-write */
+       bool writable;
+
+       /* reference count when the mapping is used by dax iomap. */
+       refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+       /* Semaphore to protect modifications to the dmap tree */
+       struct rw_semaphore sem;
+
+       /*
+        * Interval tree (cached rb root) of struct fuse_dax_mapping elements,
+        * linked through their itn member and keyed by range index.
+        */
+       struct rb_root_cached tree;
+
+       /*
+        * Number of dmaps currently in the tree: incremented on insert,
+        * reduced when ranges are reclaimed.
+        */
+       unsigned long nr;
+};
+
+/* Per-connection DAX state, reached through fc->dax */
+struct fuse_conn_dax {
+       /* DAX device */
+       struct dax_device *dev;
+
+       /* Lock protecting accesses to members of this structure */
+       spinlock_t lock;
+
+       /* Count and list of memory ranges which are busy */
+       unsigned long nr_busy_ranges;
+       struct list_head busy_ranges;
+
+       /* Worker to free up memory ranges */
+       struct delayed_work free_work;
+
+       /* Wait queue for a dax range to become free */
+       wait_queue_head_t range_waitq;
+
+       /* DAX Window Free Ranges: count and list */
+       long nr_free_ranges;
+       struct list_head free_ranges;
+
+       /*
+        * Total number of ranges; the reclaim threshold is computed as a
+        * percentage of this value.
+        */
+       unsigned long nr_ranges;
+};
+
+/* Map an interval tree node back to the dmap embedding it; NULL stays NULL. */
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+       return node ? container_of(node, struct fuse_dax_mapping, itn) : NULL;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+/*
+ * Queue fcd->free_work on system_long_wq after @delay_ms if the number of
+ * free ranges has dropped below FUSE_DAX_RECLAIM_THRESHOLD percent of all
+ * ranges (with a floor of one range).  kick_dmap_free_worker() calls this
+ * with fcd->lock held.
+ */
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+       unsigned long low_mark;
+
+       low_mark = fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100;
+       if (low_mark < 1)
+               low_mark = 1;
+
+       /* Still enough free ranges, no need to start reclaim */
+       if (fcd->nr_free_ranges >= low_mark)
+               return;
+
+       queue_delayed_work(system_long_wq, &fcd->free_work,
+                          msecs_to_jiffies(delay_ms));
+}
+
+/*
+ * Locked wrapper around __kick_dmap_free_worker(): holds fcd->lock while the
+ * free-range threshold is checked and the worker is possibly queued.
+ */
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+                                 unsigned long delay_ms)
+{
+       spin_lock(&fcd->lock);
+       __kick_dmap_free_worker(fcd, delay_ms);
+       spin_unlock(&fcd->lock);
+}
+
+/*
+ * Take the first range off fcd->free_ranges, or return NULL if the free list
+ * is empty.  Either way the free worker is kicked afterwards with no delay.
+ */
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+       struct fuse_dax_mapping *free_dmap = NULL;
+
+       spin_lock(&fcd->lock);
+       if (!list_empty(&fcd->free_ranges)) {
+               free_dmap = list_first_entry(&fcd->free_ranges,
+                                            struct fuse_dax_mapping, list);
+               list_del_init(&free_dmap->list);
+               WARN_ON(fcd->nr_free_ranges <= 0);
+               fcd->nr_free_ranges--;
+       }
+       spin_unlock(&fcd->lock);
+
+       /* Takes fcd->lock itself, so it must run after the unlock above */
+       kick_dmap_free_worker(fcd, 0);
+       return free_dmap;
+}
+
+/*
+ * Unlink @dmap from the busy list and decrement fcd->nr_busy_ranges, warning
+ * if the counter is already zero.  This assumes fcd->lock is held.
+ */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+                                   struct fuse_dax_mapping *dmap)
+{
+       list_del_init(&dmap->busy_list);
+       WARN_ON(fcd->nr_busy_ranges == 0);
+       fcd->nr_busy_ranges--;
+}
+
+/* Locked wrapper: takes fcd->lock around __dmap_remove_busy_list(). */
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+                                 struct fuse_dax_mapping *dmap)
+{
+       spin_lock(&fcd->lock);
+       __dmap_remove_busy_list(fcd, dmap);
+       spin_unlock(&fcd->lock);
+}
+
+/*
+ * Append @dmap to the tail of fcd->free_ranges, bump fcd->nr_free_ranges and
+ * wake up waiters on fcd->range_waitq.  This assumes fcd->lock is held.
+ */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+                               struct fuse_dax_mapping *dmap)
+{
+       list_add_tail(&dmap->list, &fcd->free_ranges);
+       fcd->nr_free_ranges++;
+       wake_up(&fcd->range_waitq);
+}
+
+/* Locked wrapper: takes fcd->lock around __dmap_add_to_free_pool(). */
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+                               struct fuse_dax_mapping *dmap)
+{
+       /* Return fuse_dax_mapping to free list */
+       spin_lock(&fcd->lock);
+       __dmap_add_to_free_pool(fcd, dmap);
+       spin_unlock(&fcd->lock);
+}
+
+/*
+ * Send a FUSE_SETUPMAPPING request that maps FUSE_DAX_SZ bytes of the file,
+ * starting at range index @start_idx, at dmap->window_offset in the DAX
+ * window.  The mapping is always requested readable and, if @writable, also
+ * writable.
+ *
+ * With @upgrade false the dmap is new for this inode: on success it is bound
+ * to @inode, inserted in the inode's interval tree and put on the busy list.
+ * With @upgrade true only dmap->writable is updated.
+ *
+ * Returns 0 on success or the negative error from the request.
+ */
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+                                 struct fuse_dax_mapping *dmap, bool writable,
+                                 bool upgrade)
+{
+       struct fuse_mount *fm = get_fuse_mount(inode);
+       struct fuse_conn_dax *fcd = fm->fc->dax;
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_setupmapping_in inarg;
+       loff_t offset = start_idx << FUSE_DAX_SHIFT;
+       FUSE_ARGS(args);
+       ssize_t err;
+
+       WARN_ON(fcd->nr_free_ranges < 0);
+
+       /* Ask fuse daemon to setup mapping */
+       memset(&inarg, 0, sizeof(inarg));
+       inarg.foffset = offset;
+       inarg.fh = -1;
+       inarg.moffset = dmap->window_offset;
+       inarg.len = FUSE_DAX_SZ;
+       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+       if (writable)
+               inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+       args.opcode = FUSE_SETUPMAPPING;
+       args.nodeid = fi->nodeid;
+       args.in_numargs = 1;
+       args.in_args[0].size = sizeof(inarg);
+       args.in_args[0].value = &inarg;
+       err = fuse_simple_request(fm, &args);
+       if (err < 0)
+               return err;
+       dmap->writable = writable;
+       if (!upgrade) {
+               /*
+                * We don't take a reference on inode. inode is valid right now
+                * and when inode is going away, cleanup logic should first
+                * cleanup dmap entries.
+                */
+               dmap->inode = inode;
+               dmap->itn.start = dmap->itn.last = start_idx;
+               /* Protected by fi->dax->sem */
+               interval_tree_insert(&dmap->itn, &fi->dax->tree);
+               fi->dax->nr++;
+               spin_lock(&fcd->lock);
+               list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+               fcd->nr_busy_ranges++;
+               spin_unlock(&fcd->lock);
+       }
+       return 0;
+}
+
+/*
+ * Send one FUSE_REMOVEMAPPING request for @inode.  The first input argument
+ * is the header @inargp, the second is the array @remove_one holding
+ * inargp->count entries.  Returns the result of fuse_simple_request().
+ */
+static int fuse_send_removemapping(struct inode *inode,
+                                  struct fuse_removemapping_in *inargp,
+                                  struct fuse_removemapping_one *remove_one)
+{
+       FUSE_ARGS(args);
+
+       args.opcode = FUSE_REMOVEMAPPING;
+       args.nodeid = get_fuse_inode(inode)->nodeid;
+
+       /* Header first, then the entries it describes */
+       args.in_numargs = 2;
+       args.in_args[0].value = inargp;
+       args.in_args[0].size = sizeof(*inargp);
+       args.in_args[1].value = remove_one;
+       args.in_args[1].size = inargp->count * sizeof(*remove_one);
+
+       return fuse_simple_request(get_fuse_mount(inode), &args);
+}
+
+/*
+ * Ask the server to remove the mappings of all dmaps on @to_remove (linked
+ * through dmap->list).  @num is the number of entries on that list.  Entries
+ * are sent in batches of at most FUSE_REMOVEMAPPING_MAX_ENTRY per request.
+ *
+ * Returns 0 on success (including when no request had to be sent), -ENOMEM
+ * if the batch buffer can't be allocated, or the error of the first failing
+ * request, in which case the remaining entries are not sent.
+ */
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+                                  struct list_head *to_remove)
+{
+       struct fuse_removemapping_one *remove_one, *ptr;
+       struct fuse_removemapping_in inarg;
+       struct fuse_dax_mapping *dmap;
+       /* ret starts at 0: with an empty list no request ever assigns it */
+       int ret = 0, i = 0, nr_alloc;
+
+       nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+       remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+       if (!remove_one)
+               return -ENOMEM;
+
+       ptr = remove_one;
+       list_for_each_entry(dmap, to_remove, list) {
+               ptr->moffset = dmap->window_offset;
+               ptr->len = dmap->length;
+               ptr++;
+               i++;
+               num--;
+               /* Flush when the buffer is full or this was the last entry */
+               if (i >= nr_alloc || num == 0) {
+                       memset(&inarg, 0, sizeof(inarg));
+                       inarg.count = i;
+                       ret = fuse_send_removemapping(inode, &inarg,
+                                                     remove_one);
+                       if (ret)
+                               goto out;
+                       /* Reuse the buffer for the next batch */
+                       ptr = remove_one;
+                       i = 0;
+               }
+       }
+out:
+       kfree(remove_one);
+       return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list: the entry is taken off the
+ * busy list, its inode pointer and interval tree range are cleared, and it
+ * is put on the free pool. This should be called with fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+                                           struct fuse_dax_mapping *dmap)
+{
+       pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+                dmap->itn.start, dmap->itn.last, dmap->window_offset,
+                dmap->length);
+       __dmap_remove_busy_list(fcd, dmap);
+       dmap->inode = NULL;
+       dmap->itn.start = dmap->itn.last = 0;
+       __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any inode-level lock such as fi->dax->sem; only fcd->lock
+ * is taken, while the ranges are handed back to the free pool. At this
+ * point of time it should only be called from evict_inode() path where we
+ * know all dmap entries can be reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+                                    struct inode *inode,
+                                    loff_t start, loff_t end)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_dax_mapping *dmap, *n;
+       int err, num = 0;
+       LIST_HEAD(to_remove);
+       unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+       unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+       struct interval_tree_node *node;
+
+       /*
+        * Always search again from the start: the node found in the previous
+        * round has just been removed from the tree.
+        */
+       while (1) {
+               node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+                                               end_idx);
+               if (!node)
+                       break;
+               dmap = node_to_dmap(node);
+               /* inode is going away. There should not be any users of dmap */
+               WARN_ON(refcount_read(&dmap->refcnt) > 1);
+               interval_tree_remove(&dmap->itn, &fi->dax->tree);
+               num++;
+               list_add(&dmap->list, &to_remove);
+       }
+
+       /* Nothing to remove */
+       if (list_empty(&to_remove))
+               return;
+
+       WARN_ON(fi->dax->nr < num);
+       fi->dax->nr -= num;
+       err = dmap_removemapping_list(inode, num, &to_remove);
+       /* -ENOTCONN is not reported; other errors only produce a warning */
+       if (err && err != -ENOTCONN) {
+               pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+                       start, end);
+       }
+       /* The ranges go back to the free pool even if the request failed */
+       spin_lock(&fcd->lock);
+       list_for_each_entry_safe(dmap, n, &to_remove, list) {
+               list_del_init(&dmap->list);
+               dmap_reinit_add_to_free_pool(fcd, dmap);
+       }
+       spin_unlock(&fcd->lock);
+}
+
+/*
+ * Send a FUSE_REMOVEMAPPING request carrying exactly one entry, which
+ * describes the DAX window range backing @dmap.
+ */
+static int dmap_removemapping_one(struct inode *inode,
+                                 struct fuse_dax_mapping *dmap)
+{
+       struct fuse_removemapping_in hdr;
+       struct fuse_removemapping_one entry;
+
+       /* Zero each structure in full before filling in its fields */
+       memset(&entry, 0, sizeof(entry));
+       entry.len = dmap->length;
+       entry.moffset = dmap->window_offset;
+
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.count = 1;
+
+       return fuse_send_removemapping(inode, &hdr, &entry);
+}
+
+/*
+ * Release every dmap still attached to @inode.  Called from evict_inode(),
+ * when the inode is going away, so no lock such as fi->dax->sem is taken for
+ * walking the inode's interval tree.  Taking that lock here makes the lock
+ * validator complain about a deadlock w.r.t. the fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_conn_dax *fcd = get_fuse_conn(inode)->dax;
+       const loff_t first_byte = 0;
+       const loff_t last_byte = -1;
+
+       /*
+        * truncate_inode_pages_final() has already been called by
+        * fuse_evict_inode() at this point, so no pages/exception entries
+        * should still be associated with the inode.
+        */
+       inode_reclaim_dmap_range(fcd, inode, first_byte, last_byte);
+
+       /* Every dmap of this inode must be gone now */
+       WARN_ON(fi->dax->nr);
+}
+
+/* Describe @length bytes as a hole: type IOMAP_HOLE with no backing address. */
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+       iomap->type = IOMAP_HOLE;
+       iomap->length = length;
+       iomap->addr = IOMAP_NULL_ADDR;
+}
+
+/*
+ * Fill @iomap for the file range starting at @pos using @dmap.  The mapped
+ * length is limited to @length, to the end of the dmap and to i_size.  If
+ * nothing is left to map, @iomap is filled in as a hole of @length bytes.
+ * Otherwise a reference on @dmap is taken and stored in iomap->private.
+ */
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+                           struct iomap *iomap, struct fuse_dax_mapping *dmap,
+                           unsigned int flags)
+{
+       loff_t isize = i_size_read(inode);
+       loff_t range_off = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+       loff_t nbytes = min(length, dmap->length - range_off);
+
+       /* If length is beyond end of file, truncate further */
+       if (pos + nbytes > isize)
+               nbytes = isize - pos;
+
+       if (nbytes <= 0) {
+               /* Mapping beyond end of file is hole */
+               fuse_fill_iomap_hole(iomap, length);
+               return;
+       }
+
+       iomap->addr = dmap->window_offset + range_off;
+       /* Faults get a length rounded up to a whole page */
+       iomap->length = (flags & IOMAP_FAULT) ? ALIGN(nbytes, PAGE_SIZE) : nbytes;
+       iomap->type = IOMAP_MAPPED;
+
+       /*
+        * Increase refcnt so that reclaim code knows this dmap is in use.
+        * This assumes fi->dax->sem is held either shared/exclusive.
+        */
+       refcount_inc(&dmap->refcnt);
+
+       /* iomap->private should be NULL */
+       WARN_ON_ONCE(iomap->private);
+       iomap->private = dmap;
+}
+
+/*
+ * Set up a mapping for the range containing @pos and fill @iomap from it.
+ *
+ * A free range is allocated first, before fi->dax->sem is taken for writing.
+ * If another caller set up the same range in the meantime, the existing dmap
+ * is used and the newly allocated range goes back to the free pool.
+ *
+ * Returns 0 on success, -EAGAIN in the fault path when no free range is
+ * available, or a negative error from allocation or from the setup request.
+ */
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+                                     loff_t length, unsigned int flags,
+                                     struct iomap *iomap)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_conn_dax *fcd = fc->dax;
+       struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+       int ret;
+       bool writable = flags & IOMAP_WRITE;
+       unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+       struct interval_tree_node *node;
+
+       /*
+        * Can't do inline reclaim in fault path. We call
+        * dax_layout_busy_page() before we free a range. And
+        * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
+        * In fault path we enter with fi->i_mmap_sem held and can't drop
+        * it. Also in fault path we hold fi->i_mmap_sem shared and not
+        * exclusive, so that creates further issues with fuse_wait_dax_page().
+        * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
+        * range to become free and retry.
+        */
+       if (flags & IOMAP_FAULT) {
+               alloc_dmap = alloc_dax_mapping(fcd);
+               if (!alloc_dmap)
+                       return -EAGAIN;
+       } else {
+               alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+               if (IS_ERR(alloc_dmap))
+                       return PTR_ERR(alloc_dmap);
+       }
+
+       /* If we are here, we should have memory allocated */
+       if (WARN_ON(!alloc_dmap))
+               return -EIO;
+
+       /*
+        * Take write lock so that only one caller can try to setup mapping
+        * and other waits.
+        */
+       down_write(&fi->dax->sem);
+       /*
+        * We dropped lock. Check again if somebody else setup
+        * mapping already.
+        */
+       node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+       if (node) {
+               /* Use the existing dmap and give the unused range back */
+               dmap = node_to_dmap(node);
+               fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+               dmap_add_to_free_pool(fcd, alloc_dmap);
+               up_write(&fi->dax->sem);
+               return 0;
+       }
+
+       /* Setup one mapping */
+       ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+                                    writable, false);
+       if (ret < 0) {
+               /* Setup failed: the range was never used, return it */
+               dmap_add_to_free_pool(fcd, alloc_dmap);
+               up_write(&fi->dax->sem);
+               return ret;
+       }
+       fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+       up_write(&fi->dax->sem);
+       return 0;
+}
+
+/*
+ * Upgrade the existing read-only mapping covering @pos to read-write and
+ * fill @iomap from it.  Per the comments below, the caller holds an extra
+ * reference on the dmap, which is dropped here once fi->dax->sem is held
+ * exclusively.
+ *
+ * Returns 0 on success, -EIO if the dmap is unexpectedly missing from the
+ * tree, or a negative error from the setup request.
+ */
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+                                   loff_t length, unsigned int flags,
+                                   struct iomap *iomap)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_dax_mapping *dmap;
+       int ret;
+       unsigned long idx = pos >> FUSE_DAX_SHIFT;
+       struct interval_tree_node *node;
+
+       /*
+        * Take exclusive lock so that only one caller can try to setup
+        * mapping and others wait.
+        */
+       down_write(&fi->dax->sem);
+       node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+       /* We are holding either inode lock or i_mmap_sem, and that should
+        * ensure that dmap can't be truncated. We are holding a reference
+        * on dmap and that should make sure it can't be reclaimed. So dmap
+        * should still be there in tree despite the fact we dropped and
+        * re-acquired the fi->dax->sem lock.
+        */
+       ret = -EIO;
+       if (WARN_ON(!node))
+               goto out_err;
+
+       dmap = node_to_dmap(node);
+
+       /* We took an extra reference on dmap to make sure it's not reclaimed.
+        * Now we hold fi->dax->sem lock and that reference is not needed
+        * anymore. Drop it.
+        */
+       if (refcount_dec_and_test(&dmap->refcnt)) {
+               /* refcount should not hit 0. This object only goes
+                * away when fuse connection goes away
+                */
+               WARN_ON_ONCE(1);
+       }
+
+       /* Maybe another thread already upgraded mapping while we were not
+        * holding lock.
+        */
+       if (dmap->writable) {
+               ret = 0;
+               goto out_fill_iomap;
+       }
+
+       ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+                                    true);
+       if (ret < 0)
+               goto out_err;
+out_fill_iomap:
+       fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+       up_write(&fi->dax->sem);
+       return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ *
+ * Three outcomes are possible for the range containing @pos:
+ *  - a usable mapping exists: @iomap is filled from it under the shared lock;
+ *  - a read-only mapping exists but a write was requested: the mapping is
+ *    upgraded via fuse_upgrade_dax_mapping();
+ *  - no mapping exists: a hole is reported when @pos is at or beyond i_size,
+ *    otherwise fuse_setup_new_dax_mapping() is called.
+ *
+ * Returns 0 on success, -EIO if IOMAP_REPORT is set, or whatever the
+ * upgrade/setup helper returns.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+                           unsigned int flags, struct iomap *iomap,
+                           struct iomap *srcmap)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_dax_mapping *dmap;
+       bool writable = flags & IOMAP_WRITE;
+       unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+       struct interval_tree_node *node;
+
+       /* We don't support FIEMAP */
+       if (WARN_ON(flags & IOMAP_REPORT))
+               return -EIO;
+
+       iomap->offset = pos;
+       iomap->flags = 0;
+       iomap->bdev = NULL;
+       iomap->dax_dev = fc->dax->dev;
+
+       /*
+        * Both read/write and mmap path can race here. So we need something
+        * to make sure if we are setting up mapping, then other path waits
+        *
+        * For now, use a semaphore for this. It probably needs to be
+        * optimized later.
+        */
+       down_read(&fi->dax->sem);
+       node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+       if (node) {
+               dmap = node_to_dmap(node);
+               if (writable && !dmap->writable) {
+                       /* Upgrade read-only mapping to read-write. This will
+                        * require exclusive fi->dax->sem lock as we don't want
+                        * two threads to be trying to do this simultaneously
+                        * for same dmap. So drop shared lock and acquire
+                        * exclusive lock.
+                        *
+                        * Before dropping fi->dax->sem lock, take reference
+                        * on dmap so that it's not freed by range reclaim.
+                        */
+                       refcount_inc(&dmap->refcnt);
+                       up_read(&fi->dax->sem);
+                       pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+                                __func__, pos, length);
+                       return fuse_upgrade_dax_mapping(inode, pos, length,
+                                                       flags, iomap);
+               } else {
+                       fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+                       up_read(&fi->dax->sem);
+                       return 0;
+               }
+       } else {
+               up_read(&fi->dax->sem);
+               pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+                               __func__, pos, length);
+               if (pos >= i_size_read(inode))
+                       goto iomap_hole;
+
+               return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+                                                 iomap);
+       }
+
+       /*
+        * If read beyond end of file happens, fs code seems to return
+        * it as hole
+        */
+iomap_hole:
+       fuse_fill_iomap_hole(iomap, length);
+       pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+                __func__, pos, length, iomap->length);
+       return 0;
+}
+
+/*
+ * iomap_end callback: drop one reference on the dmap found in
+ * iomap->private, if any. Always returns 0.
+ */
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+                         ssize_t written, unsigned int flags,
+                         struct iomap *iomap)
+{
+       struct fuse_dax_mapping *mapping = iomap->private;
+
+       /* DAX writes beyond end-of-file aren't handled using iomap, so the
+        * file size is unchanged and there is nothing else to do here.
+        */
+       if (!mapping)
+               return 0;
+
+       /* refcount should not hit 0. This object only goes away when fuse
+        * connection goes away
+        */
+       if (refcount_dec_and_test(&mapping->refcnt))
+               WARN_ON_ONCE(1);
+
+       return 0;
+}
+
+/* iomap callbacks passed to dax_iomap_rw() and dax_iomap_fault() below. */
+static const struct iomap_ops fuse_iomap_ops = {
+       .iomap_begin = fuse_iomap_begin,
+       .iomap_end = fuse_iomap_end,
+};
+
+/*
+ * Release fi->i_mmap_sem, call schedule(), then take the semaphore again
+ * exclusively. Used as the wait command of ___wait_var_event() in
+ * __fuse_dax_break_layouts().
+ */
+static void fuse_wait_dax_page(struct inode *inode)
+{
+       struct rw_semaphore *mmap_sem = &get_fuse_inode(inode)->i_mmap_sem;
+
+       up_write(mmap_sem);
+       schedule();
+       down_write(mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively
+ *
+ * Looks for a busy page in [start, end] of the inode's mapping. If none is
+ * found, returns 0 and leaves *retry untouched. Otherwise sets *retry and
+ * waits interruptibly for the page's _refcount to reach 1, returning the
+ * result of that wait.
+ */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+                                   loff_t start, loff_t end)
+{
+       struct page *page;
+
+       page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+       if (!page)
+               return 0;
+
+       *retry = true;
+       /* fuse_wait_dax_page() drops and re-takes i_mmap_sem around schedule() */
+       return ___wait_var_event(&page->_refcount,
+                       atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+                       0, 0, fuse_wait_dax_page(inode));
+}
+
+/*
+ * Repeat __fuse_dax_break_layouts() on [dmap_start, dmap_end] until a pass
+ * either fails or finds no busy page. Returns 0 or the first non-zero result.
+ *
+ * dmap_end == 0 leads to unmapping of whole file
+ */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+                                 u64 dmap_end)
+{
+       int err;
+       bool again;
+
+       for (;;) {
+               again = false;
+               err = __fuse_dax_break_layouts(inode, &again, dmap_start,
+                                              dmap_end);
+               if (err || !again)
+                       return err;
+       }
+}
+
+/*
+ * DAX read path: run dax_iomap_rw() with the inode lock held shared.
+ * With IOCB_NOWAIT the lock is only tried and -EAGAIN is returned when it
+ * cannot be taken immediately.
+ */
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       ssize_t nread;
+
+       if (!(iocb->ki_flags & IOCB_NOWAIT))
+               inode_lock_shared(inode);
+       else if (!inode_trylock_shared(inode))
+               return -EAGAIN;
+
+       nread = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+       inode_unlock_shared(inode);
+
+       /* TODO file_accessed(iocb->ki_filp) */
+       return nread;
+}
+
+/*
+ * Return true when @from is a write that starts at or beyond the current
+ * i_size, or that would end beyond it.
+ */
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+
+       if (iov_iter_rw(from) != WRITE)
+               return false;
+
+       if (iocb->ki_pos >= i_size_read(inode))
+               return true;
+
+       return iocb->ki_pos + iov_iter_count(from) > i_size_read(inode);
+}
+
+/*
+ * Write through fuse_direct_io() instead of DAX. On success the cached
+ * attributes are invalidated and the size is updated from iocb->ki_pos.
+ * Returns the fuse_direct_io() result.
+ */
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+       ssize_t written;
+
+       written = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+       if (written >= 0) {
+               fuse_invalidate_attr(inode);
+               fuse_write_update_size(inode, iocb->ki_pos);
+       }
+
+       return written;
+}
+
+/*
+ * DAX write path. Takes the inode lock exclusively (or returns -EAGAIN under
+ * IOCB_NOWAIT when it is contended), runs the generic write checks and
+ * privilege removal, then writes either through fuse_dax_direct_write() for
+ * file-extending writes or through dax_iomap_rw() otherwise. A positive
+ * result is passed through generic_write_sync() after the lock is dropped.
+ */
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       ssize_t ret;
+
+       if (!(iocb->ki_flags & IOCB_NOWAIT))
+               inode_lock(inode);
+       else if (!inode_trylock(inode))
+               return -EAGAIN;
+
+       ret = generic_write_checks(iocb, from);
+       if (ret > 0) {
+               ret = file_remove_privs(iocb->ki_filp);
+               /* TODO file_update_time() but we don't want metadata I/O */
+               if (!ret) {
+                       /* Do not use dax for file extending writes as write
+                        * and on disk i_size increase are not atomic
+                        * otherwise.
+                        */
+                       if (file_extending_write(iocb, from))
+                               ret = fuse_dax_direct_write(iocb, from);
+                       else
+                               ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+               }
+       }
+
+       inode_unlock(inode);
+
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
+       return ret;
+}
+
+/*
+ * ->writepages for DAX inodes: hand the mapping to
+ * dax_writeback_mapping_range() together with the connection's dax device.
+ */
+static int fuse_dax_writepages(struct address_space *mapping,
+                              struct writeback_control *wbc)
+{
+       struct fuse_conn *conn = get_fuse_conn(mapping->host);
+
+       return dax_writeback_mapping_range(mapping, conn->dax->dev, wbc);
+}
+
+/*
+ * Common DAX fault handler behind all fuse_dax_vm_ops entry points.
+ *
+ * Runs dax_iomap_fault() with fi->i_mmap_sem held shared. When the fault
+ * fails with error == -EAGAIN, the semaphore is released, the caller waits
+ * for fcd->nr_free_ranges to become positive, and the fault is retried.
+ * For write faults the whole sequence is bracketed by
+ * sb_start_pagefault()/sb_end_pagefault().
+ */
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+                                  enum page_entry_size pe_size, bool write)
+{
+       vm_fault_t ret;
+       struct inode *inode = file_inode(vmf->vma->vm_file);
+       struct super_block *sb = inode->i_sb;
+       pfn_t pfn;
+       int error = 0;
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_conn_dax *fcd = fc->dax;
+       bool retry = false;
+
+       if (write)
+               sb_start_pagefault(sb);
+retry:
+       /* Only wait on a retry pass, and only if no range is free yet. */
+       if (retry && !(fcd->nr_free_ranges > 0))
+               wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+       /*
+        * We need to serialize against not only truncate but also against
+        * fuse dax memory range reclaim. While a range is being reclaimed,
+        * we do not want any read/write/mmap to make progress and try
+        * to populate page cache or access memory we are trying to free.
+        */
+       down_read(&get_fuse_inode(inode)->i_mmap_sem);
+       ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+       if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+               /* Drop the semaphore before waiting for a free range. */
+               error = 0;
+               retry = true;
+               up_read(&get_fuse_inode(inode)->i_mmap_sem);
+               goto retry;
+       }
+
+       if (ret & VM_FAULT_NEEDDSYNC)
+               ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+       up_read(&get_fuse_inode(inode)->i_mmap_sem);
+
+       if (write)
+               sb_end_pagefault(sb);
+
+       return ret;
+}
+
+/* ->fault: PTE-sized fault; write intent is taken from FAULT_FLAG_WRITE. */
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+       bool is_write = vmf->flags & FAULT_FLAG_WRITE;
+
+       return __fuse_dax_fault(vmf, PE_SIZE_PTE, is_write);
+}
+
+/* ->huge_fault: fault of size @pe_size; write intent from FAULT_FLAG_WRITE. */
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+                              enum page_entry_size pe_size)
+{
+       bool is_write = vmf->flags & FAULT_FLAG_WRITE;
+
+       return __fuse_dax_fault(vmf, pe_size, is_write);
+}
+
+/* ->page_mkwrite: always handled as a PTE-sized write fault. */
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+       return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+/* ->pfn_mkwrite: always handled as a PTE-sized write fault. */
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+       return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+/* VMA operations installed by fuse_dax_mmap(); all route to __fuse_dax_fault(). */
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+       .fault          = fuse_dax_fault,
+       .huge_fault     = fuse_dax_huge_fault,
+       .page_mkwrite   = fuse_dax_page_mkwrite,
+       .pfn_mkwrite    = fuse_dax_pfn_mkwrite,
+};
+
+/*
+ * mmap handler for DAX files: marks the file accessed, installs
+ * fuse_dax_vm_ops and sets VM_MIXEDMAP | VM_HUGEPAGE on the VMA.
+ * Always returns 0.
+ */
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       file_accessed(file);
+       vma->vm_ops = &fuse_dax_vm_ops;
+       vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+       return 0;
+}
+
+/*
+ * Write back and then invalidate the page-cache range of @inode covered by
+ * @dmap, i.e. FUSE_DAX_SZ bytes starting at dmap->itn.start << FUSE_DAX_SHIFT.
+ *
+ * Returns 0 on success or the error from filemap_fdatawrite_range() /
+ * invalidate_inode_pages2_range().
+ */
+static int dmap_writeback_invalidate(struct inode *inode,
+                                    struct fuse_dax_mapping *dmap)
+{
+       int ret;
+       /* Widen before shifting so the byte offset is computed in loff_t
+        * rather than in the (possibly narrower) type of the range index.
+        */
+       loff_t start_pos = (loff_t)dmap->itn.start << FUSE_DAX_SHIFT;
+       loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+       ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+       if (ret) {
+               pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+                        ret, start_pos, end_pos);
+               return ret;
+       }
+
+       ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                           start_pos >> PAGE_SHIFT,
+                                           end_pos >> PAGE_SHIFT);
+       if (ret)
+               pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+                        ret);
+
+       return ret;
+}
+
+/*
+ * Detach @dmap from @inode: write back and invalidate its page range, remove
+ * it from the inode's interval tree, and ask for the mapping to be removed
+ * via dmap_removemapping_one().
+ *
+ * Returns the error from dmap_writeback_invalidate() if that step fails (the
+ * tree is left untouched in that case); otherwise 0, even when
+ * dmap_removemapping_one() fails, which is only logged.
+ */
+static int reclaim_one_dmap_locked(struct inode *inode,
+                                  struct fuse_dax_mapping *dmap)
+{
+       int ret;
+       struct fuse_inode *fi = get_fuse_inode(inode);
+
+       /*
+        * igrab() was done to make sure inode won't go under us, and this
+        * further avoids the race with evict().
+        */
+       ret = dmap_writeback_invalidate(inode, dmap);
+       if (ret)
+               return ret;
+
+       /* Remove dax mapping from inode interval tree now */
+       interval_tree_remove(&dmap->itn, &fi->dax->tree);
+       fi->dax->nr--;
+
+       /* It is possible that umount/shutdown has killed the fuse connection
+        * and worker thread is trying to reclaim memory in parallel.  Don't
+        * warn in that case.
+        */
+       ret = dmap_removemapping_one(inode, dmap);
+       if (ret && ret != -ENOTCONN) {
+               pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+                       dmap->window_offset, dmap->length, ret);
+       }
+       return 0;
+}
+
+/* Walk the inode's interval tree and return the first dmap whose refcount is
+ * not above 1, or NULL when every dmap is still in use. Caller needs to hold
+ * fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct interval_tree_node *itn;
+
+       itn = interval_tree_iter_first(&fi->dax->tree, 0, -1);
+       while (itn) {
+               struct fuse_dax_mapping *candidate = node_to_dmap(itn);
+
+               /* Not referenced beyond the base count: usable. */
+               if (refcount_read(&candidate->refcnt) <= 1)
+                       return candidate;
+
+               itn = interval_tree_iter_next(itn, 0, -1);
+       }
+
+       return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ *
+ * Runs with fi->i_mmap_sem held exclusively for the whole operation; the
+ * candidate is picked under the shared fi->dax->sem and re-validated under
+ * the exclusive one after fuse_dax_break_layouts().
+ *
+ * Returns the reclaimed dmap, an ERR_PTR() on failure, or NULL when nothing
+ * was reclaimed. When NULL is returned because the candidate vanished or
+ * was still referenced, *retry (if @retry is non-NULL) is set to true.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+                             bool *retry)
+{
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_dax_mapping *dmap;
+       u64 dmap_start, dmap_end;
+       unsigned long start_idx;
+       int ret;
+       struct interval_tree_node *node;
+
+       down_write(&fi->i_mmap_sem);
+
+       /* Lookup a dmap and corresponding file offset to reclaim. */
+       down_read(&fi->dax->sem);
+       dmap = inode_lookup_first_dmap(inode);
+       if (dmap) {
+               start_idx = dmap->itn.start;
+               /* Widen before shifting so the offset is computed in u64. */
+               dmap_start = (u64)start_idx << FUSE_DAX_SHIFT;
+               dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+       }
+       up_read(&fi->dax->sem);
+
+       if (!dmap)
+               goto out_mmap_sem;
+       /*
+        * Make sure there are no references to inode pages using
+        * get_user_pages()
+        */
+       ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+       if (ret) {
+               pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+                        ret);
+               dmap = ERR_PTR(ret);
+               goto out_mmap_sem;
+       }
+
+       down_write(&fi->dax->sem);
+       node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+       /* Range already got reclaimed by somebody else */
+       if (!node) {
+               /* The pointer from the first lookup is stale now; do not
+                * hand it back to the caller as if it had been reclaimed.
+                */
+               dmap = NULL;
+               if (retry)
+                       *retry = true;
+               goto out_write_dmap_sem;
+       }
+
+       dmap = node_to_dmap(node);
+       /* still in use. */
+       if (refcount_read(&dmap->refcnt) > 1) {
+               dmap = NULL;
+               if (retry)
+                       *retry = true;
+               goto out_write_dmap_sem;
+       }
+
+       ret = reclaim_one_dmap_locked(inode, dmap);
+       if (ret < 0) {
+               dmap = ERR_PTR(ret);
+               goto out_write_dmap_sem;
+       }
+
+       /* Clean up dmap. Do not add back to free list */
+       dmap_remove_busy_list(fcd, dmap);
+       dmap->inode = NULL;
+       dmap->itn.start = dmap->itn.last = 0;
+
+       pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+                __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+       up_write(&fi->dax->sem);
+out_mmap_sem:
+       up_write(&fi->i_mmap_sem);
+       return dmap;
+}
+
+/*
+ * Allocate a dax mapping, reclaiming one from @inode inline when the free
+ * pool is empty. Loops until a mapping is obtained, an error is returned by
+ * the inline reclaim, or the killable wait is interrupted (ERR_PTR(-EINTR)).
+ */
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+       struct fuse_dax_mapping *dmap;
+       struct fuse_inode *fi = get_fuse_inode(inode);
+
+       while (1) {
+               bool retry = false;
+
+               dmap = alloc_dax_mapping(fcd);
+               if (dmap)
+                       return dmap;
+
+               dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+               /*
+                * Either we got a mapping or it is an error, return in both
+                * the cases.
+                */
+               if (dmap)
+                       return dmap;
+
+               /* If we could not reclaim a mapping because it
+                * had a reference or some other temporary failure,
+                * try again. We want to give up inline reclaim only
+                * if there is no range assigned to this inode. Otherwise
+                * a deadlock is possible if we sleep with fi->i_mmap_sem
+                * held and the worker freeing memory can't make progress
+                * due to unavailability of fi->i_mmap_sem lock. So sleep
+                * only if fi->dax->nr=0
+                */
+               if (retry)
+                       continue;
+               /*
+                * There are no mappings which can be reclaimed. Wait for one.
+                * We are not holding fi->dax->sem. So it is possible
+                * that range gets added now. But as we are not holding
+                * fi->i_mmap_sem, worker should still be able to free up
+                * a range and wake us up.
+                */
+               if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+                       if (wait_event_killable_exclusive(fcd->range_waitq,
+                                       (fcd->nr_free_ranges > 0))) {
+                               return ERR_PTR(-EINTR);
+                       }
+               }
+       }
+}
+
+/*
+ * Reclaim the dmap at range index @start_idx of @inode and return it to the
+ * connection's free pool. The only visible caller, lookup_and_reclaim_dmap(),
+ * holds fi->i_mmap_sem and fi->dax->sem exclusively around this call.
+ *
+ * Returns 0 when the range was reclaimed, and also when there was nothing to
+ * do (no mapping at @start_idx, or the mapping is still referenced); a
+ * negative error from reclaim_one_dmap_locked() otherwise.
+ */
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+                                         struct inode *inode,
+                                         unsigned long start_idx)
+{
+       int ret;
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       struct fuse_dax_mapping *dmap;
+       struct interval_tree_node *node;
+
+       /* Find fuse dax mapping at file offset inode. */
+       node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+       /* Range already got cleaned up by somebody else */
+       if (!node)
+               return 0;
+       dmap = node_to_dmap(node);
+
+       /* still in use. */
+       if (refcount_read(&dmap->refcnt) > 1)
+               return 0;
+
+       ret = reclaim_one_dmap_locked(inode, dmap);
+       if (ret < 0)
+               return ret;
+
+       /* Cleanup dmap entry and add back to free list */
+       spin_lock(&fcd->lock);
+       dmap_reinit_add_to_free_pool(fcd, dmap);
+       spin_unlock(&fcd->lock);
+       return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take fi->i_mmap_sem to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ *    read/write can not reuse a dmap which we might be freeing.
+ *
+ * Only the range at @start_idx is processed; @end_idx is currently unused.
+ * Returns 0 on success (including when there was nothing to reclaim) or the
+ * error from fuse_dax_break_layouts() / lookup_and_reclaim_dmap_locked().
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+                                  struct inode *inode,
+                                  unsigned long start_idx,
+                                  unsigned long end_idx)
+{
+       int ret;
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       /* Widen before shifting: start_idx is unsigned long, and the shift
+        * would otherwise be evaluated in that type before conversion to
+        * loff_t.
+        */
+       loff_t dmap_start = (loff_t)start_idx << FUSE_DAX_SHIFT;
+       loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+       down_write(&fi->i_mmap_sem);
+       ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+       if (ret) {
+               pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+                        ret);
+               goto out_mmap_sem;
+       }
+
+       down_write(&fi->dax->sem);
+       ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+       up_write(&fi->dax->sem);
+out_mmap_sem:
+       up_write(&fi->i_mmap_sem);
+       return ret;
+}
+
+/*
+ * Try to reclaim up to @nr_to_free busy ranges of the connection.
+ *
+ * Each pass picks, under fcd->lock, the first busy range that is not
+ * referenced beyond the base count and whose inode can be pinned with
+ * igrab(), then reclaims it via lookup_and_reclaim_dmap().
+ *
+ * Returns 0 when the quota is reached or no candidate is left, or the first
+ * error from lookup_and_reclaim_dmap(). Note that nr_freed is incremented
+ * for every call that returns 0, which includes calls that found nothing to
+ * reclaim.
+ */
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+                                  unsigned long nr_to_free)
+{
+       struct fuse_dax_mapping *dmap, *pos, *temp;
+       int ret, nr_freed = 0;
+       unsigned long start_idx = 0, end_idx = 0;
+       struct inode *inode = NULL;
+
+       /* Pick first busy range and free it for now*/
+       while (1) {
+               if (nr_freed >= nr_to_free)
+                       break;
+
+               dmap = NULL;
+               spin_lock(&fcd->lock);
+
+               if (!fcd->nr_busy_ranges) {
+                       spin_unlock(&fcd->lock);
+                       return 0;
+               }
+
+               list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+                                               busy_list) {
+                       /* skip this range if it's in use. */
+                       if (refcount_read(&pos->refcnt) > 1)
+                               continue;
+
+                       inode = igrab(pos->inode);
+                       /*
+                        * This inode is going away. That will free
+                        * up all the ranges anyway, continue to
+                        * next range.
+                        */
+                       if (!inode)
+                               continue;
+                       /*
+                        * Take this element off list and add it tail. If
+                        * this element can't be freed, it will help with
+                        * selecting new element in next iteration of loop.
+                        */
+                       dmap = pos;
+                       list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+                       start_idx = end_idx = dmap->itn.start;
+                       break;
+               }
+               spin_unlock(&fcd->lock);
+               /* No reclaimable range found in this pass. */
+               if (!dmap)
+                       return 0;
+
+               ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+               /* Drop the reference taken by igrab() above. */
+               iput(inode);
+               if (ret)
+                       return ret;
+               nr_freed++;
+       }
+       return 0;
+}
+
+/*
+ * Delayed-work handler (fcd->free_work): try to free up to
+ * FUSE_DAX_RECLAIM_CHUNK ranges, log a failure at debug level, then call
+ * kick_dmap_free_worker() again.
+ */
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+       int ret;
+       struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+                                                free_work.work);
+       ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+       if (ret) {
+               pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+                        ret);
+       }
+
+       /* If number of free ranges are still below threshold, requeue */
+       kick_dmap_free_worker(fcd, 1);
+}
+
+/*
+ * Free every fuse_dax_mapping linked on @mem_list, unlinking each one from
+ * that list and, if it is on one, from its busy list before kfree().
+ */
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+       struct fuse_dax_mapping *range, *temp;
+
+       /* Free All allocated elements */
+       list_for_each_entry_safe(range, temp, mem_list, list) {
+               list_del(&range->list);
+               if (!list_empty(&range->busy_list))
+                       list_del(&range->busy_list);
+               kfree(range);
+       }
+}
+
+/*
+ * Release the per-connection DAX state: free all ranges on the free list and
+ * then the fuse_conn_dax itself. No-op when the connection has no DAX state.
+ */
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+       if (!fc->dax)
+               return;
+
+       fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+       kfree(fc->dax);
+}
+
+/*
+ * Initialise the range bookkeeping of @fcd: wait queue, free/busy lists and
+ * the reclaim work item, then query the dax device for the number of
+ * directly accessible pages and create one fuse_dax_mapping of FUSE_DAX_SZ
+ * bytes per FUSE_DAX_PAGES pages, all placed on the free list.
+ *
+ * Returns 0 on success, the negative value from dax_direct_access(), or
+ * -ENOMEM (after freeing the ranges allocated so far).
+ */
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+       long nr_pages, nr_ranges;
+       void *kaddr;
+       pfn_t pfn;
+       struct fuse_dax_mapping *range;
+       int ret, id;
+       /* Largest possible size: ask for as many pages as the device has. */
+       size_t dax_size = -1;
+       unsigned long i;
+
+       init_waitqueue_head(&fcd->range_waitq);
+       INIT_LIST_HEAD(&fcd->free_ranges);
+       INIT_LIST_HEAD(&fcd->busy_ranges);
+       INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+       id = dax_read_lock();
+       nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
+                                    &pfn);
+       dax_read_unlock(id);
+       if (nr_pages < 0) {
+               pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+               return nr_pages;
+       }
+
+       nr_ranges = nr_pages/FUSE_DAX_PAGES;
+       pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+               __func__, nr_pages, nr_ranges);
+
+       for (i = 0; i < nr_ranges; i++) {
+               range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+               ret = -ENOMEM;
+               if (!range)
+                       goto out_err;
+
+               /* TODO: This offset only works if virtio-fs driver is not
+                * having some memory hidden at the beginning. This needs
+                * better handling
+                */
+               range->window_offset = i * FUSE_DAX_SZ;
+               range->length = FUSE_DAX_SZ;
+               INIT_LIST_HEAD(&range->busy_list);
+               /* Base reference; "in use" checks elsewhere test for > 1. */
+               refcount_set(&range->refcnt, 1);
+               list_add_tail(&range->list, &fcd->free_ranges);
+       }
+
+       fcd->nr_free_ranges = nr_ranges;
+       fcd->nr_ranges = nr_ranges;
+       return 0;
+out_err:
+       /* Free All allocated elements */
+       fuse_free_dax_mem_ranges(&fcd->free_ranges);
+       return ret;
+}
+
+/*
+ * Allocate and initialise the per-connection DAX state for @dax_dev and
+ * attach it to @fc. A NULL @dax_dev means DAX is not in use and is not an
+ * error.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
+ * fuse_dax_mem_range_init().
+ */
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
+{
+       struct fuse_conn_dax *conn_dax;
+       int rc;
+
+       if (!dax_dev)
+               return 0;
+
+       conn_dax = kzalloc(sizeof(*conn_dax), GFP_KERNEL);
+       if (!conn_dax)
+               return -ENOMEM;
+
+       spin_lock_init(&conn_dax->lock);
+       conn_dax->dev = dax_dev;
+
+       rc = fuse_dax_mem_range_init(conn_dax);
+       if (!rc) {
+               fc->dax = conn_dax;
+               return 0;
+       }
+
+       kfree(conn_dax);
+       return rc;
+}
+
+/*
+ * Set up the per-inode DAX state of @fi. When the connection has no DAX
+ * state, fi->dax stays NULL and true is returned. Otherwise a zeroed
+ * structure is allocated and its semaphore and interval tree initialised.
+ *
+ * Returns false only when the allocation fails.
+ */
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+       struct fuse_conn *conn = get_fuse_conn_super(sb);
+
+       fi->dax = NULL;
+       if (!conn->dax)
+               return true;
+
+       fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+       if (!fi->dax)
+               return false;
+
+       init_rwsem(&fi->dax->sem);
+       fi->dax->tree = RB_ROOT_CACHED;
+       return true;
+}
+
+/*
+ * Address-space operations installed on DAX inodes by fuse_dax_inode_init().
+ * Only ->writepages has a fuse-specific implementation; the rest are the
+ * generic noop_* helpers.
+ */
+static const struct address_space_operations fuse_dax_file_aops  = {
+       .writepages     = fuse_dax_writepages,
+       .direct_IO      = noop_direct_IO,
+       .set_page_dirty = noop_set_page_dirty,
+       .invalidatepage = noop_invalidatepage,
+};
+
+/*
+ * Mark @inode as a DAX inode (S_DAX) and switch its address-space operations
+ * to fuse_dax_file_aops, but only when the connection has DAX state.
+ */
+void fuse_dax_inode_init(struct inode *inode)
+{
+       if (get_fuse_conn(inode)->dax) {
+               inode->i_flags |= S_DAX;
+               inode->i_data.a_ops = &fuse_dax_file_aops;
+       }
+}
+
+/*
+ * Check @map_alignment against the dax range size. Returns true when the
+ * connection has no DAX state or when @map_alignment does not exceed
+ * FUSE_DAX_SHIFT; otherwise warns and returns false.
+ */
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+       if (!fc->dax || map_alignment <= FUSE_DAX_SHIFT)
+               return true;
+
+       pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+               map_alignment, FUSE_DAX_SZ);
+       return false;
+}
+
+/*
+ * Cancel the range-reclaim delayed work of @fc and wait for it to finish.
+ * No-op when the connection has no DAX state.
+ */
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+       struct fuse_conn_dax *conn_dax = fc->dax;
+
+       if (!conn_dax)
+               return;
+
+       cancel_delayed_work_sync(&conn_dax->free_work);
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c

index 02b3c36..588f8d1 100644 (file)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
         return READ_ONCE(file->private_data);
  }
  
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
  {
         INIT_LIST_HEAD(&req->list);
         INIT_LIST_HEAD(&req->intr_entry);
         init_waitqueue_head(&req->waitq);
         refcount_set(&req->count, 1);
         __set_bit(FR_PENDING, &req->flags);
+       req->fm = fm;
  }
  
-static struct fuse_req *fuse_request_alloc(gfp_t flags)
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
  {
         struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
         if (req)
-               fuse_request_init(req);
+               fuse_request_init(fm, req);
  
         return req;
  }
@@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
         }
  }
  
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+static void fuse_put_request(struct fuse_req *req);
  
-static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
  {
+       struct fuse_conn *fc = fm->fc;
         struct fuse_req *req;
         int err;
         atomic_inc(&fc->num_waiting);
@@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
         if (fc->conn_error)
                 goto out;
  
-       req = fuse_request_alloc(GFP_KERNEL);
+       req = fuse_request_alloc(fm, GFP_KERNEL);
         err = -ENOMEM;
         if (!req) {
                 if (for_background)
@@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
  
         if (unlikely(req->in.h.uid == ((uid_t)-1) ||
                      req->in.h.gid == ((gid_t)-1))) {
-               fuse_put_request(fc, req);
+               fuse_put_request(req);
                 return ERR_PTR(-EOVERFLOW);
         }
         return req;
@@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
         return ERR_PTR(err);
  }
  
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_req *req)
  {
+       struct fuse_conn *fc = req->fm->fc;
+
         if (refcount_dec_and_test(&req->count)) {
                 if (test_bit(FR_BACKGROUND, &req->flags)) {
                         /*
@@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
   * the 'end' callback is called if given, else the reference to the
   * request is released
   */
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_req *req)
  {
+       struct fuse_mount *fm = req->fm;
+       struct fuse_conn *fc = fm->fc;
         struct fuse_iqueue *fiq = &fc->iq;
  
         if (test_and_set_bit(FR_FINISHED, &req->flags))
@@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
                                 wake_up(&fc->blocked_waitq);
                 }
  
-               if (fc->num_background == fc->congestion_threshold && fc->sb) {
-                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
-                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+               if (fc->num_background == fc->congestion_threshold && fm->sb) {
+                       clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+                       clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
                 }
                 fc->num_background--;
                 fc->active_background--;
@@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
         }
  
         if (test_bit(FR_ASYNC, &req->flags))
-               req->args->end(fc, req->args, req->out.h.error);
+               req->args->end(fm, req->args, req->out.h.error);
  put_request:
-       fuse_put_request(fc, req);
+       fuse_put_request(req);
  }
  EXPORT_SYMBOL_GPL(fuse_request_end);
  
-static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_req *req)
  {
+       struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
         spin_lock(&fiq->lock);
         /* Check for we've sent request to interrupt this req */
         if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
@@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
         return 0;
  }
  
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void request_wait_answer(struct fuse_req *req)
  {
+       struct fuse_conn *fc = req->fm->fc;
         struct fuse_iqueue *fiq = &fc->iq;
         int err;
  
@@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
                 /* matches barrier in fuse_dev_do_read() */
                 smp_mb__after_atomic();
                 if (test_bit(FR_SENT, &req->flags))
-                       queue_interrupt(fiq, req);
+                       queue_interrupt(req);
         }
  
         if (!test_bit(FR_FORCE, &req->flags)) {
@@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
         wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
  }
  
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_req *req)
  {
-       struct fuse_iqueue *fiq = &fc->iq;
+       struct fuse_iqueue *fiq = &req->fm->fc->iq;
  
         BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
         spin_lock(&fiq->lock);
@@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
                 __fuse_get_request(req);
                 queue_request_and_unlock(fiq, req);
  
-               request_wait_answer(fc, req);
+               request_wait_answer(req);
                 /* Pairs with smp_wmb() in fuse_request_end() */
                 smp_rmb();
         }
@@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
         }
  }
  
-static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_force_creds(struct fuse_req *req)
  {
+       struct fuse_conn *fc = req->fm->fc;
+
         req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
         req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
         req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
@@ -473,23 +484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
                 __set_bit(FR_ASYNC, &req->flags);
  }
  
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
  {
+       struct fuse_conn *fc = fm->fc;
         struct fuse_req *req;
         ssize_t ret;
  
         if (args->force) {
                 atomic_inc(&fc->num_waiting);
-               req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+               req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
  
                 if (!args->nocreds)
-                       fuse_force_creds(fc, req);
+                       fuse_force_creds(req);
  
                 __set_bit(FR_WAITING, &req->flags);
                 __set_bit(FR_FORCE, &req->flags);
         } else {
                 WARN_ON(args->nocreds);
-               req = fuse_get_req(fc, false);
+               req = fuse_get_req(fm, false);
                 if (IS_ERR(req))
                         return PTR_ERR(req);
         }
@@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
  
         if (!args->noreply)
                 __set_bit(FR_ISREPLY, &req->flags);
-       __fuse_request_send(fc, req);
+       __fuse_request_send(req);
         ret = req->out.h.error;
         if (!ret && args->out_argvar) {
                 BUG_ON(args->out_numargs == 0);
                 ret = args->out_args[args->out_numargs - 1].size;
         }
-       fuse_put_request(fc, req);
+       fuse_put_request(req);
  
         return ret;
  }
  
-static bool fuse_request_queue_background(struct fuse_conn *fc,
-                                         struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_req *req)
  {
+       struct fuse_mount *fm = req->fm;
+       struct fuse_conn *fc = fm->fc;
         bool queued = false;
  
         WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
@@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
                 fc->num_background++;
                 if (fc->num_background == fc->max_background)
                         fc->blocked = 1;
-               if (fc->num_background == fc->congestion_threshold && fc->sb) {
-                       set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
-                       set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+               if (fc->num_background == fc->congestion_threshold && fm->sb) {
+                       set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+                       set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
                 }
                 list_add_tail(&req->list, &fc->bg_queue);
                 flush_bg_queue(fc);
@@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
         return queued;
  }
  
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
                             gfp_t gfp_flags)
  {
         struct fuse_req *req;
  
         if (args->force) {
                 WARN_ON(!args->nocreds);
-               req = fuse_request_alloc(gfp_flags);
+               req = fuse_request_alloc(fm, gfp_flags);
                 if (!req)
                         return -ENOMEM;
                 __set_bit(FR_BACKGROUND, &req->flags);
         } else {
                 WARN_ON(args->nocreds);
-               req = fuse_get_req(fc, true);
+               req = fuse_get_req(fm, true);
                 if (IS_ERR(req))
                         return PTR_ERR(req);
         }
  
         fuse_args_to_req(req, args);
  
-       if (!fuse_request_queue_background(fc, req)) {
-               fuse_put_request(fc, req);
+       if (!fuse_request_queue_background(req)) {
+               fuse_put_request(req);
                 return -ENOTCONN;
         }
  
@@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
  }
  EXPORT_SYMBOL_GPL(fuse_simple_background);
  
-static int fuse_simple_notify_reply(struct fuse_conn *fc,
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
                                     struct fuse_args *args, u64 unique)
  {
         struct fuse_req *req;
-       struct fuse_iqueue *fiq = &fc->iq;
+       struct fuse_iqueue *fiq = &fm->fc->iq;
         int err = 0;
  
-       req = fuse_get_req(fc, false);
+       req = fuse_get_req(fm, false);
         if (IS_ERR(req))
                 return PTR_ERR(req);
  
@@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc,
         } else {
                 err = -ENODEV;
                 spin_unlock(&fiq->lock);
-               fuse_put_request(fc, req);
+               fuse_put_request(req);
         }
  
         return err;
@@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
         struct page *newpage;
         struct pipe_buffer *buf = cs->pipebufs;
  
+       get_page(oldpage);
         err = unlock_request(cs->req);
         if (err)
-               return err;
+               goto out_put_old;
  
         fuse_copy_finish(cs);
  
         err = pipe_buf_confirm(cs->pipe, buf);
         if (err)
-               return err;
+               goto out_put_old;
  
         BUG_ON(!cs->nr_segs);
         cs->currbuf = buf;
@@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
         err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
         if (err) {
                 unlock_page(newpage);
-               return err;
+               goto out_put_old;
         }
  
         get_page(newpage);
@@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
         if (err) {
                 unlock_page(newpage);
                 put_page(newpage);
-               return err;
+               goto out_put_old;
         }
  
         unlock_page(oldpage);
+       /* Drop ref for ap->pages[] array */
         put_page(oldpage);
         cs->len = 0;
  
-       return 0;
+       err = 0;
+out_put_old:
+       /* Drop ref obtained in this function */
+       put_page(oldpage);
+       return err;
  
  out_fallback_unlock:
         unlock_page(newpage);
@@ -868,10 +887,10 @@ out_fallback:
         cs->offset = buf->offset;
  
         err = lock_request(cs->req);
-       if (err)
-               return err;
+       if (!err)
+               err = 1;
  
-       return 1;
+       goto out_put_old;
  }
  
  static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
@@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
         if (cs->nr_segs >= cs->pipe->max_usage)
                 return -EIO;
  
+       get_page(page);
         err = unlock_request(cs->req);
-       if (err)
+       if (err) {
+               put_page(page);
                 return err;
+       }
  
         fuse_copy_finish(cs);
  
         buf = cs->pipebufs;
-       get_page(page);
         buf->page = page;
         buf->offset = offset;
         buf->len = count;
@@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
                 /* SETXATTR is special, since it may contain too large data */
                 if (args->opcode == FUSE_SETXATTR)
                         req->out.h.error = -E2BIG;
-               fuse_request_end(fc, req);
+               fuse_request_end(req);
                 goto restart;
         }
         spin_lock(&fpq->lock);
@@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
         /* matches barrier in request_wait_answer() */
         smp_mb__after_atomic();
         if (test_bit(FR_INTERRUPTED, &req->flags))
-               queue_interrupt(fiq, req);
-       fuse_put_request(fc, req);
+               queue_interrupt(req);
+       fuse_put_request(req);
  
         return reqsize;
  
@@ -1293,7 +1314,7 @@ out_end:
         if (!test_bit(FR_PRIVATE, &req->flags))
                 list_del_init(&req->list);
         spin_unlock(&fpq->lock);
-       fuse_request_end(fc, req);
+       fuse_request_end(req);
         return err;
  
   err_unlock:
@@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
         fuse_copy_finish(cs);
  
         down_read(&fc->killsb);
-       err = -ENOENT;
-       if (fc->sb) {
-               err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
-                                              outarg.off, outarg.len);
-       }
+       err = fuse_reverse_inval_inode(fc, outarg.ino,
+                                      outarg.off, outarg.len);
         up_read(&fc->killsb);
         return err;
  
@@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
         buf[outarg.namelen] = 0;
  
         down_read(&fc->killsb);
-       err = -ENOENT;
-       if (fc->sb)
-               err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+       err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
         up_read(&fc->killsb);
         kfree(buf);
         return err;
@@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
         buf[outarg.namelen] = 0;
  
         down_read(&fc->killsb);
-       err = -ENOENT;
-       if (fc->sb)
-               err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
-                                              outarg.child, &name);
+       err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
         up_read(&fc->killsb);
         kfree(buf);
         return err;
@@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
         down_read(&fc->killsb);
  
         err = -ENOENT;
-       if (!fc->sb)
-               goto out_up_killsb;
-
-       inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+       inode = fuse_ilookup(fc, nodeid,  NULL);
         if (!inode)
                 goto out_up_killsb;
  
@@ -1621,7 +1631,7 @@ struct fuse_retrieve_args {
         struct fuse_notify_retrieve_in inarg;
  };
  
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
                               int error)
  {
         struct fuse_retrieve_args *ra =
@@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
         kfree(ra);
  }
  
-static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
                          struct fuse_notify_retrieve_out *outarg)
  {
         int err;
@@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
         unsigned int offset;
         size_t total_len = 0;
         unsigned int num_pages;
+       struct fuse_conn *fc = fm->fc;
         struct fuse_retrieve_args *ra;
         size_t args_size = sizeof(*ra);
         struct fuse_args_pages *ap;
@@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
         args->in_args[0].value = &ra->inarg;
         args->in_args[1].size = total_len;
  
-       err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+       err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
         if (err)
-               fuse_retrieve_end(fc, args, err);
+               fuse_retrieve_end(fm, args, err);
  
         return err;
  }
@@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
                                 struct fuse_copy_state *cs)
  {
         struct fuse_notify_retrieve_out outarg;
+       struct fuse_mount *fm;
         struct inode *inode;
+       u64 nodeid;
         int err;
  
         err = -EINVAL;
@@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
  
         down_read(&fc->killsb);
         err = -ENOENT;
-       if (fc->sb) {
-               u64 nodeid = outarg.nodeid;
+       nodeid = outarg.nodeid;
  
-               inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
-               if (inode) {
-                       err = fuse_retrieve(fc, inode, &outarg);
-                       iput(inode);
-               }
+       inode = fuse_ilookup(fc, nodeid, &fm);
+       if (inode) {
+               err = fuse_retrieve(fm, inode, &outarg);
+               iput(inode);
         }
         up_read(&fc->killsb);
  
@@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
                 else if (oh.error == -ENOSYS)
                         fc->no_interrupt = 1;
                 else if (oh.error == -EAGAIN)
-                       err = queue_interrupt(&fc->iq, req);
+                       err = queue_interrupt(req);
  
-               fuse_put_request(fc, req);
+               fuse_put_request(req);
  
                 goto copy_finish;
         }
@@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
                 list_del_init(&req->list);
         spin_unlock(&fpq->lock);
  
-       fuse_request_end(fc, req);
+       fuse_request_end(req);
  out:
         return err ? err : nbytes;
  
@@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
  }
  
  /* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct fuse_conn *fc, struct list_head *head)
+static void end_requests(struct list_head *head)
  {
         while (!list_empty(head)) {
                 struct fuse_req *req;
@@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
                 req->out.h.error = -ECONNABORTED;
                 clear_bit(FR_SENT, &req->flags);
                 list_del_init(&req->list);
-               fuse_request_end(fc, req);
+               fuse_request_end(req);
         }
  }
  
@@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
                 wake_up_all(&fc->blocked_waitq);
                 spin_unlock(&fc->lock);
  
-               end_requests(fc, &to_end);
+               end_requests(&to_end);
         } else {
                 spin_unlock(&fc->lock);
         }
@@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
                         list_splice_init(&fpq->processing[i], &to_end);
                 spin_unlock(&fpq->lock);
  
-               end_requests(fc, &to_end);
+               end_requests(&to_end);
  
                 /* Are we the last open device? */
                 if (atomic_dec_and_test(&fc->dev_count)) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c

index 26f028b..ff7dbeb 100644 (file)
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,6 +10,7 @@
  
  #include <linux/pagemap.h>
  #include <linux/file.h>
+#include <linux/fs_context.h>
  #include <linux/sched.h>
  #include <linux/namei.h>
  #include <linux/slab.h>
@@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
  {
         struct inode *inode;
         struct dentry *parent;
-       struct fuse_conn *fc;
+       struct fuse_mount *fm;
         struct fuse_inode *fi;
         int ret;
  
@@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
                 if (flags & LOOKUP_RCU)
                         goto out;
  
-               fc = get_fuse_conn(inode);
+               fm = get_fuse_mount(inode);
  
                 forget = fuse_alloc_forget();
                 ret = -ENOMEM;
                 if (!forget)
                         goto out;
  
-               attr_version = fuse_get_attr_version(fc);
+               attr_version = fuse_get_attr_version(fm->fc);
  
                 parent = dget_parent(entry);
-               fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+               fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
                                  &entry->d_name, &outarg);
-               ret = fuse_simple_request(fc, &args);
+               ret = fuse_simple_request(fm, &args);
                 dput(parent);
                 /* Zero nodeid is same as -ENOENT */
                 if (!ret && !outarg.nodeid)
                         ret = -ENOENT;
                 if (!ret) {
                         fi = get_fuse_inode(inode);
-                       if (outarg.nodeid != get_node_id(inode)) {
-                               fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+                       if (outarg.nodeid != get_node_id(inode) ||
+                           (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+                               fuse_queue_forget(fm->fc, forget,
+                                                 outarg.nodeid, 1);
                                 goto invalid;
                         }
                         spin_lock(&fi->lock);
@@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry)
         return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
  }
  
+/*
+ * d_automount callback: create a fuse_mount with a new superblock rooted
+ * at path->dentry, sharing the parent mount's fuse_conn.  Returns the
+ * vfsmount to be auto-mounted on @path, or an ERR_PTR() on failure.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+       struct fs_context *fsc;
+       struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb);
+       struct fuse_conn *fc = parent_fm->fc;
+       struct fuse_mount *fm;
+       struct vfsmount *mnt;
+       struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+       struct super_block *sb;
+       int err;
+
+       fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+       if (IS_ERR(fsc)) {
+               err = PTR_ERR(fsc);
+               goto out;
+       }
+
+       err = -ENOMEM;
+       fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+       if (!fm)
+               goto out_put_fsc;
+
+       refcount_set(&fm->count, 1); /* fm starts with one reference */
+       fsc->s_fs_info = fm;
+       sb = sget_fc(fsc, NULL, set_anon_super_fc);
+       if (IS_ERR(sb)) {
+               err = PTR_ERR(sb);
+               fuse_mount_put(fm); /* drop that initial reference */
+               goto out_put_fsc;
+       }
+       fm->fc = fuse_conn_get(fc); /* submount uses the parent's connection */
+
+       /* Initialize superblock, making @mp_fi its root */
+       err = fuse_fill_super_submount(sb, mp_fi);
+       if (err)
+               goto out_put_sb; /* NOTE(review): fm not on fc->mounts yet */
+
+       sb->s_flags |= SB_ACTIVE;
+       fsc->root = dget(sb->s_root);
+       /* Superblock is configured: unlock it (fm joins fc->mounts below) */
+       up_write(&sb->s_umount);
+
+       down_write(&fc->killsb); /* NOTE(review): sb is already unlocked here */
+       list_add_tail(&fm->fc_entry, &fc->mounts);
+       up_write(&fc->killsb);
+
+       /* Create the submount */
+       mnt = vfs_create_mount(fsc);
+       if (IS_ERR(mnt)) {
+               err = PTR_ERR(mnt);
+               goto out_put_fsc;
+       }
+       mntget(mnt); /* extra ref; presumably for d_automount caller - confirm */
+       put_fs_context(fsc);
+       return mnt;
+
+out_put_sb:
+       /*
+        * Only jump here when fsc->root is NULL and sb is still locked
+        * (otherwise put_fs_context() will put the superblock)
+        */
+       deactivate_locked_super(sb);
+out_put_fsc:
+       put_fs_context(fsc);
+out:
+       return ERR_PTR(err);
+}
+
  const struct dentry_operations fuse_dentry_operations = {
         .d_revalidate   = fuse_dentry_revalidate,
         .d_delete       = fuse_dentry_delete,
@@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = {
         .d_init         = fuse_dentry_init,
         .d_release      = fuse_dentry_release,
  #endif
+       .d_automount    = fuse_dentry_automount,
  };
  
  const struct dentry_operations fuse_root_dentry_operations = {
@@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
  int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
                      struct fuse_entry_out *outarg, struct inode **inode)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
         FUSE_ARGS(args);
         struct fuse_forget_link *forget;
         u64 attr_version;
@@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
         if (!forget)
                 goto out;
  
-       attr_version = fuse_get_attr_version(fc);
+       attr_version = fuse_get_attr_version(fm->fc);
  
-       fuse_lookup_init(fc, &args, nodeid, name, outarg);
-       err = fuse_simple_request(fc, &args);
+       fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+       err = fuse_simple_request(fm, &args);
         /* Zero nodeid is same as -ENOENT, but with valid timeout */
         if (err || !outarg->nodeid)
                 goto out_put_forget;
@@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
                            attr_version);
         err = -ENOMEM;
         if (!*inode) {
-               fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+               fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
                 goto out;
         }
         err = 0;
@@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
  {
         int err;
         struct inode *inode;
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         FUSE_ARGS(args);
         struct fuse_forget_link *forget;
         struct fuse_create_in inarg;
@@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
                 goto out_err;
  
         err = -ENOMEM;
-       ff = fuse_file_alloc(fc);
+       ff = fuse_file_alloc(fm);
         if (!ff)
                 goto out_put_forget_req;
  
-       if (!fc->dont_mask)
+       if (!fm->fc->dont_mask)
                 mode &= ~current_umask();
  
         flags &= ~O_NOCTTY;
@@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
         args.out_args[0].value = &outentry;
         args.out_args[1].size = sizeof(outopen);
         args.out_args[1].value = &outopen;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err)
                 goto out_free_ff;
  
@@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
         if (!inode) {
                 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
                 fuse_sync_release(NULL, ff, flags);
-               fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
                 err = -ENOMEM;
                 goto out_err;
         }
@@ -567,7 +644,7 @@ no_open:
  /*
   * Code shared between mknod, mkdir, symlink and link
   */
-static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
                             struct inode *dir, struct dentry *entry,
                             umode_t mode)
  {
@@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
         args->out_numargs = 1;
         args->out_args[0].size = sizeof(outarg);
         args->out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, args);
+       err = fuse_simple_request(fm, args);
         if (err)
                 goto out_put_forget_req;
  
@@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
         inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
                           &outarg.attr, entry_attr_timeout(&outarg), 0);
         if (!inode) {
-               fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+               fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
                 return -ENOMEM;
         }
         kfree(forget);
@@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
                       dev_t rdev)
  {
         struct fuse_mknod_in inarg;
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         FUSE_ARGS(args);
  
-       if (!fc->dont_mask)
+       if (!fm->fc->dont_mask)
                 mode &= ~current_umask();
  
         memset(&inarg, 0, sizeof(inarg));
@@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
         args.in_args[0].value = &inarg;
         args.in_args[1].size = entry->d_name.len + 1;
         args.in_args[1].value = entry->d_name.name;
-       return create_new_entry(fc, &args, dir, entry, mode);
+       return create_new_entry(fm, &args, dir, entry, mode);
  }
  
  static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
@@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
  static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
  {
         struct fuse_mkdir_in inarg;
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         FUSE_ARGS(args);
  
-       if (!fc->dont_mask)
+       if (!fm->fc->dont_mask)
                 mode &= ~current_umask();
  
         memset(&inarg, 0, sizeof(inarg));
@@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
         args.in_args[0].value = &inarg;
         args.in_args[1].size = entry->d_name.len + 1;
         args.in_args[1].value = entry->d_name.name;
-       return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+       return create_new_entry(fm, &args, dir, entry, S_IFDIR);
  }
  
  static int fuse_symlink(struct inode *dir, struct dentry *entry,
                         const char *link)
  {
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         unsigned len = strlen(link) + 1;
         FUSE_ARGS(args);
  
@@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
         args.in_args[0].value = entry->d_name.name;
         args.in_args[1].size = len;
         args.in_args[1].value = link;
-       return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+       return create_new_entry(fm, &args, dir, entry, S_IFLNK);
  }
  
  void fuse_update_ctime(struct inode *inode)
@@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode)
  static int fuse_unlink(struct inode *dir, struct dentry *entry)
  {
         int err;
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         FUSE_ARGS(args);
  
         args.opcode = FUSE_UNLINK;
@@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
         args.in_numargs = 1;
         args.in_args[0].size = entry->d_name.len + 1;
         args.in_args[0].value = entry->d_name.name;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err) {
                 struct inode *inode = d_inode(entry);
                 struct fuse_inode *fi = get_fuse_inode(inode);
  
                 spin_lock(&fi->lock);
-               fi->attr_version = atomic64_inc_return(&fc->attr_version);
+               fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
                 /*
                  * If i_nlink == 0 then unlink doesn't make sense, yet this can
                  * happen if userspace filesystem is careless.  It would be
@@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
  static int fuse_rmdir(struct inode *dir, struct dentry *entry)
  {
         int err;
-       struct fuse_conn *fc = get_fuse_conn(dir);
+       struct fuse_mount *fm = get_fuse_mount(dir);
         FUSE_ARGS(args);
  
         args.opcode = FUSE_RMDIR;
@@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
         args.in_numargs = 1;
         args.in_args[0].size = entry->d_name.len + 1;
         args.in_args[0].value = entry->d_name.name;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err) {
                 clear_nlink(d_inode(entry));
                 fuse_dir_changed(dir);
@@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
  {
         int err;
         struct fuse_rename2_in inarg;
-       struct fuse_conn *fc = get_fuse_conn(olddir);
+       struct fuse_mount *fm = get_fuse_mount(olddir);
         FUSE_ARGS(args);
  
         memset(&inarg, 0, argsize);
@@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
         args.in_args[1].value = oldent->d_name.name;
         args.in_args[2].size = newent->d_name.len + 1;
         args.in_args[2].value = newent->d_name.name;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err) {
                 /* ctime changes */
                 fuse_invalidate_attr(d_inode(oldent));
@@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
         int err;
         struct fuse_link_in inarg;
         struct inode *inode = d_inode(entry);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
  
         memset(&inarg, 0, sizeof(inarg));
@@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
         args.in_args[0].value = &inarg;
         args.in_args[1].size = newent->d_name.len + 1;
         args.in_args[1].value = newent->d_name.name;
-       err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+       err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
         /* Contrary to "normal" filesystems it can happen that link
            makes two "logical" inodes point to the same "physical"
            inode.  We invalidate the attributes of the old one, so it
@@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
                 struct fuse_inode *fi = get_fuse_inode(inode);
  
                 spin_lock(&fi->lock);
-               fi->attr_version = atomic64_inc_return(&fc->attr_version);
+               fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
                 if (likely(inode->i_nlink < UINT_MAX))
                         inc_nlink(inode);
                 spin_unlock(&fi->lock);
@@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
         int err;
         struct fuse_getattr_in inarg;
         struct fuse_attr_out outarg;
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         u64 attr_version;
  
-       attr_version = fuse_get_attr_version(fc);
+       attr_version = fuse_get_attr_version(fm->fc);
  
         memset(&inarg, 0, sizeof(inarg));
         memset(&outarg, 0, sizeof(outarg));
@@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err) {
                 if (fuse_invalid_attr(&outarg.attr) ||
                     (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file)
                                     STATX_BASIC_STATS & ~STATX_ATIME, 0);
  }
  
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
                              u64 child_nodeid, struct qstr *name)
  {
         int err = -ENOTDIR;
@@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
         struct dentry *dir;
         struct dentry *entry;
  
-       parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+       parent = fuse_ilookup(fc, parent_nodeid, NULL);
         if (!parent)
                 return -ENOENT;
  
@@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc)
  
  static int fuse_access(struct inode *inode, int mask)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_access_in inarg;
         int err;
  
         BUG_ON(mask & MAY_NOT_BLOCK);
  
-       if (fc->no_access)
+       if (fm->fc->no_access)
                 return 0;
  
         memset(&inarg, 0, sizeof(inarg));
@@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask)
         args.in_numargs = 1;
         args.in_args[0].size = sizeof(inarg);
         args.in_args[0].value = &inarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
-               fc->no_access = 1;
+               fm->fc->no_access = 1;
                 err = 0;
         }
         return err;
@@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask)
  
  static int fuse_readlink_page(struct inode *inode, struct page *page)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
         struct fuse_args_pages ap = {
                 .num_pages = 1,
@@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page)
         ap.args.page_zeroing = true;
         ap.args.out_numargs = 1;
         ap.args.out_args[0].size = desc.length;
-       res = fuse_simple_request(fc, &ap.args);
+       res = fuse_simple_request(fm, &ap.args);
  
         fuse_invalidate_atime(inode);
  
@@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
   */
  int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_setattr_in inarg;
         struct fuse_attr_out outarg;
@@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
         inarg.valid = FATTR_MTIME;
         inarg.mtime = inode->i_mtime.tv_sec;
         inarg.mtimensec = inode->i_mtime.tv_nsec;
-       if (fc->minor >= 23) {
+       if (fm->fc->minor >= 23) {
                 inarg.valid |= FATTR_CTIME;
                 inarg.ctime = inode->i_ctime.tv_sec;
                 inarg.ctimensec = inode->i_ctime.tv_nsec;
@@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
                 inarg.valid |= FATTR_FH;
                 inarg.fh = ff->fh;
         }
-       fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+       fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
  
-       return fuse_simple_request(fc, &args);
+       return fuse_simple_request(fm, &args);
  }
  
  /*
@@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
                     struct file *file)
  {
         struct inode *inode = d_inode(dentry);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
+       struct fuse_conn *fc = fm->fc;
         struct fuse_inode *fi = get_fuse_inode(inode);
         FUSE_ARGS(args);
         struct fuse_setattr_in inarg;
@@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         loff_t oldsize;
         int err;
         bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+       bool fault_blocked = false;
  
         if (!fc->default_permissions)
                 attr->ia_valid |= ATTR_FORCE;
@@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         if (err)
                 return err;
  
+       if (attr->ia_valid & ATTR_SIZE) {
+               if (WARN_ON(!S_ISREG(inode->i_mode)))
+                       return -EIO;
+               is_truncate = true;
+       }
+
+       if (FUSE_IS_DAX(inode) && is_truncate) {
+               down_write(&fi->i_mmap_sem);
+               fault_blocked = true;
+               err = fuse_dax_break_layouts(inode, 0, 0);
+               if (err) {
+                       up_write(&fi->i_mmap_sem);
+                       return err;
+               }
+       }
+
         if (attr->ia_valid & ATTR_OPEN) {
                 /* This is coming from open(..., ... | O_TRUNC); */
                 WARN_ON(!(attr->ia_valid & ATTR_SIZE));
@@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
                          */
                         i_size_write(inode, 0);
                         truncate_pagecache(inode, 0);
-                       return 0;
+                       goto out;
                 }
                 file = NULL;
         }
  
-       if (attr->ia_valid & ATTR_SIZE) {
-               if (WARN_ON(!S_ISREG(inode->i_mode)))
-                       return -EIO;
-               is_truncate = true;
-       }
-
         /* Flush dirty data/metadata before non-truncate SETATTR */
         if (is_wb && S_ISREG(inode->i_mode) &&
             attr->ia_valid &
@@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
                 inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
         }
         fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err) {
                 if (err == -EINTR)
                         fuse_invalidate_attr(inode);
@@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
         }
  
         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+       if (fault_blocked)
+               up_write(&fi->i_mmap_sem);
+
         return 0;
  
  error:
@@ -1621,6 +1714,9 @@ error:
                 fuse_release_nowrite(inode);
  
         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+       if (fault_blocked)
+               up_write(&fi->i_mmap_sem);
         return err;
  }
  
diff --git a/fs/fuse/file.c b/fs/fuse/file.c

index 43c165e..c03034e 100644 (file)
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
         return pages;
  }
  
-static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
                           int opcode, struct fuse_open_out *outargp)
  {
         struct fuse_open_in inarg;
@@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
  
         memset(&inarg, 0, sizeof(inarg));
         inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
-       if (!fc->atomic_o_trunc)
+       if (!fm->fc->atomic_o_trunc)
                 inarg.flags &= ~O_TRUNC;
         args.opcode = opcode;
         args.nodeid = nodeid;
@@ -51,7 +51,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
         args.out_args[0].size = sizeof(*outargp);
         args.out_args[0].value = outargp;
  
-       return fuse_simple_request(fc, &args);
+       return fuse_simple_request(fm, &args);
  }
  
  struct fuse_release_args {
@@ -60,7 +60,7 @@ struct fuse_release_args {
         struct inode *inode;
  };
  
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
  {
         struct fuse_file *ff;
  
@@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
         if (unlikely(!ff))
                 return NULL;
  
-       ff->fc = fc;
+       ff->fm = fm;
         ff->release_args = kzalloc(sizeof(*ff->release_args),
                                    GFP_KERNEL_ACCOUNT);
         if (!ff->release_args) {
@@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
         RB_CLEAR_NODE(&ff->polled_node);
         init_waitqueue_head(&ff->poll_wait);
  
-       ff->kh = atomic64_inc_return(&fc->khctr);
+       ff->kh = atomic64_inc_return(&fm->fc->khctr);
  
         return ff;
  }
@@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
         return ff;
  }
  
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
                              int error)
  {
         struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
@@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
         if (refcount_dec_and_test(&ff->count)) {
                 struct fuse_args *args = &ff->release_args->args;
  
-               if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+               if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
                         /* Do nothing when client does not implement 'open' */
-                       fuse_release_end(ff->fc, args, 0);
+                       fuse_release_end(ff->fm, args, 0);
                 } else if (sync) {
-                       fuse_simple_request(ff->fc, args);
-                       fuse_release_end(ff->fc, args, 0);
+                       fuse_simple_request(ff->fm, args);
+                       fuse_release_end(ff->fm, args, 0);
                 } else {
                         args->end = fuse_release_end;
-                       if (fuse_simple_background(ff->fc, args,
+                       if (fuse_simple_background(ff->fm, args,
                                                    GFP_KERNEL | __GFP_NOFAIL))
-                               fuse_release_end(ff->fc, args, -ENOTCONN);
+                               fuse_release_end(ff->fm, args, -ENOTCONN);
                 }
                 kfree(ff);
         }
  }
  
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
                  bool isdir)
  {
+       struct fuse_conn *fc = fm->fc;
         struct fuse_file *ff;
         int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
  
-       ff = fuse_file_alloc(fc);
+       ff = fuse_file_alloc(fm);
         if (!ff)
                 return -ENOMEM;
  
@@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
                 struct fuse_open_out outarg;
                 int err;
  
-               err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+               err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
                 if (!err) {
                         ff->fh = outarg.fh;
                         ff->open_flags = outarg.open_flags;
@@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file)
  
  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
+       struct fuse_conn *fc = fm->fc;
         int err;
         bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
                           fc->atomic_o_trunc &&
                           fc->writeback_cache;
+       bool dax_truncate = (file->f_flags & O_TRUNC) &&
+                         fc->atomic_o_trunc && FUSE_IS_DAX(inode);
  
         err = generic_file_open(inode, file);
         if (err)
                 return err;
  
-       if (is_wb_truncate) {
+       if (is_wb_truncate || dax_truncate) {
                 inode_lock(inode);
                 fuse_set_nowrite(inode);
         }
  
-       err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+       if (dax_truncate) {
+               down_write(&get_fuse_inode(inode)->i_mmap_sem);
+               err = fuse_dax_break_layouts(inode, 0, 0);
+               if (err)
+                       goto out;
+       }
  
+       err = fuse_do_open(fm, get_node_id(inode), file, isdir);
         if (!err)
                 fuse_finish_open(inode, file);
  
-       if (is_wb_truncate) {
+out:
+       if (dax_truncate)
+               up_write(&get_fuse_inode(inode)->i_mmap_sem);
+
+       if (is_wb_truncate || dax_truncate) {
                 fuse_release_nowrite(inode);
                 inode_unlock(inode);
         }
@@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
  static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
                                  int flags, int opcode)
  {
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_conn *fc = ff->fm->fc;
         struct fuse_release_args *ra = ff->release_args;
  
         /* Inode is NULL on error path of fuse_create_open() */
@@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir)
  
         if (ff->flock) {
                 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
-               ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+               ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
                                                           (fl_owner_t) file);
         }
         /* Hold inode until release is finished */
@@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir)
          * synchronous RELEASE is allowed (and desirable) in this case
          * because the server can be trusted not to screw up.
          */
-       fuse_file_put(ff, ff->fc->destroy, isdir);
+       fuse_file_put(ff, ff->fm->fc->destroy, isdir);
  }
  
  static int fuse_open(struct inode *inode, struct file *file)
@@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode)
  static int fuse_flush(struct file *file, fl_owner_t id)
  {
         struct inode *inode = file_inode(file);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_file *ff = file->private_data;
         struct fuse_flush_in inarg;
         FUSE_ARGS(args);
@@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id)
                 return err;
  
         err = 0;
-       if (fc->no_flush)
+       if (fm->fc->no_flush)
                 goto inval_attr_out;
  
         memset(&inarg, 0, sizeof(inarg));
         inarg.fh = ff->fh;
-       inarg.lock_owner = fuse_lock_owner_id(fc, id);
+       inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
         args.opcode = FUSE_FLUSH;
         args.nodeid = get_node_id(inode);
         args.in_numargs = 1;
@@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
         args.in_args[0].value = &inarg;
         args.force = true;
  
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
-               fc->no_flush = 1;
+               fm->fc->no_flush = 1;
                 err = 0;
         }
  
@@ -489,7 +503,7 @@ inval_attr_out:
          * In memory i_blocks is not maintained by fuse, if writeback cache is
          * enabled, i_blocks from cached attr may not be accurate.
          */
-       if (!err && fc->writeback_cache)
+       if (!err && fm->fc->writeback_cache)
                 fuse_invalidate_attr(inode);
         return err;
  }
@@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
                       int datasync, int opcode)
  {
         struct inode *inode = file->f_mapping->host;
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_file *ff = file->private_data;
         FUSE_ARGS(args);
         struct fuse_fsync_in inarg;
@@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
         args.in_numargs = 1;
         args.in_args[0].size = sizeof(inarg);
         args.in_args[0].value = &inarg;
-       return fuse_simple_request(fc, &args);
+       return fuse_simple_request(fm, &args);
  }
  
  static int fuse_fsync(struct file *file, loff_t start, loff_t end,
@@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia)
         kfree(ia);
  }
  
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
                                   int err)
  {
         struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
@@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
         fuse_io_free(ia);
  }
  
-static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
                                    struct fuse_io_args *ia, size_t num_bytes)
  {
         ssize_t err;
@@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
  
         ia->ap.args.end = fuse_aio_complete_req;
         ia->ap.args.may_block = io->should_dirty;
-       err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
+       err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
         if (err)
-               fuse_aio_complete_req(fc, &ia->ap.args, err);
+               fuse_aio_complete_req(fm, &ia->ap.args, err);
  
         return num_bytes;
  }
@@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
  {
         struct file *file = ia->io->iocb->ki_filp;
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
  
         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
         if (owner != NULL) {
                 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
-               ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
+               ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
         }
  
         if (ia->io->async)
-               return fuse_async_req_send(fc, ia, count);
+               return fuse_async_req_send(fm, ia, count);
  
-       return fuse_simple_request(fc, &ia->ap.args);
+       return fuse_simple_request(fm, &ia->ap.args);
  }
  
  static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
  static int fuse_do_readpage(struct file *file, struct page *page)
  {
         struct inode *inode = page->mapping->host;
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         loff_t pos = page_offset(page);
         struct fuse_page_desc desc = { .length = PAGE_SIZE };
         struct fuse_io_args ia = {
@@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page)
          */
         fuse_wait_on_page_writeback(inode, page->index);
  
-       attr_ver = fuse_get_attr_version(fc);
+       attr_ver = fuse_get_attr_version(fm->fc);
  
         /* Don't overflow end offset */
         if (pos + (desc.length - 1) == LLONG_MAX)
                 desc.length--;
  
         fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
-       res = fuse_simple_request(fc, &ia.ap.args);
+       res = fuse_simple_request(fm, &ia.ap.args);
         if (res < 0)
                 return res;
         /*
@@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page)
         return err;
  }
  
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
                                int err)
  {
         int i;
@@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
  static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
  {
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         struct fuse_args_pages *ap = &ia->ap;
         loff_t pos = page_offset(ap->pages[0]);
         size_t count = ap->num_pages << PAGE_SHIFT;
@@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
         WARN_ON((loff_t) (pos + count) < 0);
  
         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
-       ia->read.attr_ver = fuse_get_attr_version(fc);
-       if (fc->async_read) {
+       ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+       if (fm->fc->async_read) {
                 ia->ff = fuse_file_get(ff);
                 ap->args.end = fuse_readpages_end;
-               err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+               err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
                 if (!err)
                         return;
         } else {
-               res = fuse_simple_request(fc, &ap->args);
+               res = fuse_simple_request(fm, &ap->args);
                 err = res < 0 ? res : 0;
         }
-       fuse_readpages_end(fc, &ap->args, err);
+       fuse_readpages_end(fm, &ap->args, err);
  }
  
  static void fuse_readahead(struct readahead_control *rac)
@@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
         args->opcode = FUSE_WRITE;
         args->nodeid = ff->nodeid;
         args->in_numargs = 2;
-       if (ff->fc->minor < 9)
+       if (ff->fm->fc->minor < 9)
                 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
         else
                 args->in_args[0].size = sizeof(ia->write.in);
@@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
         struct kiocb *iocb = ia->io->iocb;
         struct file *file = iocb->ki_filp;
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         struct fuse_write_in *inarg = &ia->write.in;
         ssize_t err;
  
@@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
         inarg->flags = fuse_write_flags(iocb);
         if (owner != NULL) {
                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
-               inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+               inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
         }
  
         if (ia->io->async)
-               return fuse_async_req_send(fc, ia, count);
+               return fuse_async_req_send(fm, ia, count);
  
-       err = fuse_simple_request(fc, &ia->ap.args);
+       err = fuse_simple_request(fm, &ia->ap.args);
         if (!err && ia->write.out.size > count)
                 err = -EIO;
  
@@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
         struct fuse_args_pages *ap = &ia->ap;
         struct file *file = iocb->ki_filp;
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         unsigned int offset, i;
         int err;
  
@@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
         fuse_write_args_fill(ia, ff, pos, count);
         ia->write.in.flags = fuse_write_flags(iocb);
  
-       err = fuse_simple_request(fc, &ap->args);
+       err = fuse_simple_request(fm, &ap->args);
         if (!err && ia->write.out.size > count)
                 err = -EIO;
  
@@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
         struct file *file = io->iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_conn *fc = ff->fm->fc;
         size_t nmax = write ? fc->max_write : fc->max_read;
         loff_t pos = *ppos;
         size_t count = iov_iter_count(iter);
@@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
         struct file *file = iocb->ki_filp;
         struct fuse_file *ff = file->private_data;
+       struct inode *inode = file_inode(file);
  
-       if (is_bad_inode(file_inode(file)))
+       if (is_bad_inode(inode))
                 return -EIO;
  
+       if (FUSE_IS_DAX(inode))
+               return fuse_dax_read_iter(iocb, to);
+
         if (!(ff->open_flags & FOPEN_DIRECT_IO))
                 return fuse_cache_read_iter(iocb, to);
         else
@@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
         struct file *file = iocb->ki_filp;
         struct fuse_file *ff = file->private_data;
+       struct inode *inode = file_inode(file);
  
-       if (is_bad_inode(file_inode(file)))
+       if (is_bad_inode(inode))
                 return -EIO;
  
+       if (FUSE_IS_DAX(inode))
+               return fuse_dax_write_iter(iocb, from);
+
         if (!(ff->open_flags & FOPEN_DIRECT_IO))
                 return fuse_cache_write_iter(iocb, from);
         else
@@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
         kfree(wpa);
  }
  
-static void fuse_writepage_finish(struct fuse_conn *fc,
+static void fuse_writepage_finish(struct fuse_mount *fm,
                                   struct fuse_writepage_args *wpa)
  {
         struct fuse_args_pages *ap = &wpa->ia.ap;
@@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
  }
  
  /* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc,
+static void fuse_send_writepage(struct fuse_mount *fm,
                                 struct fuse_writepage_args *wpa, loff_t size)
  __releases(fi->lock)
  __acquires(fi->lock)
@@ -1622,10 +1644,10 @@ __acquires(fi->lock)
         args->force = true;
         args->nocreds = true;
  
-       err = fuse_simple_background(fc, args, GFP_ATOMIC);
+       err = fuse_simple_background(fm, args, GFP_ATOMIC);
         if (err == -ENOMEM) {
                 spin_unlock(&fi->lock);
-               err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+               err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
                 spin_lock(&fi->lock);
         }
  
@@ -1638,7 +1660,7 @@ __acquires(fi->lock)
   out_free:
         fi->writectr--;
         rb_erase(&wpa->writepages_entry, &fi->writepages);
-       fuse_writepage_finish(fc, wpa);
+       fuse_writepage_finish(fm, wpa);
         spin_unlock(&fi->lock);
  
         /* After fuse_writepage_finish() aux request list is private */
@@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode)
  __releases(fi->lock)
  __acquires(fi->lock)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_inode *fi = get_fuse_inode(inode);
         loff_t crop = i_size_read(inode);
         struct fuse_writepage_args *wpa;
@@ -1671,7 +1693,7 @@ __acquires(fi->lock)
                 wpa = list_entry(fi->queued_writes.next,
                                  struct fuse_writepage_args, queue_entry);
                 list_del_init(&wpa->queue_entry);
-               fuse_send_writepage(fc, wpa, crop);
+               fuse_send_writepage(fm, wpa, crop);
         }
  }
  
@@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
         WARN_ON(fuse_insert_writeback(root, wpa));
  }
  
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
                                int error)
  {
         struct fuse_writepage_args *wpa =
@@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
         spin_lock(&fi->lock);
         rb_erase(&wpa->writepages_entry, &fi->writepages);
         while (wpa->next) {
-               struct fuse_conn *fc = get_fuse_conn(inode);
+               struct fuse_mount *fm = get_fuse_mount(inode);
                 struct fuse_write_in *inarg = &wpa->ia.write.in;
                 struct fuse_writepage_args *next = wpa->next;
  
@@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
                  * no invocations of fuse_writepage_end() while we're in
                  * fuse_set_nowrite..fuse_release_nowrite section.
                  */
-               fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+               fuse_send_writepage(fm, next, inarg->offset + inarg->size);
         }
         fi->writectr--;
-       fuse_writepage_finish(fc, wpa);
+       fuse_writepage_finish(fm, wpa);
         spin_unlock(&fi->lock);
         fuse_writepage_free(wpa);
  }
@@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
  {
         struct fuse_file *ff = file->private_data;
  
+       /* DAX mmap is superior to direct_io mmap */
+       if (FUSE_IS_DAX(file_inode(file)))
+               return fuse_dax_mmap(file, vma);
+
         if (ff->open_flags & FOPEN_DIRECT_IO) {
                 /* Can't provide the coherency needed for MAP_SHARED */
                 if (vma->vm_flags & VM_MAYSHARE)
@@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
  static int fuse_getlk(struct file *file, struct file_lock *fl)
  {
         struct inode *inode = file_inode(file);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_lk_in inarg;
         struct fuse_lk_out outarg;
@@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err)
-               err = convert_fuse_file_lock(fc, &outarg.lk, fl);
+               err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
  
         return err;
  }
@@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
  static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
  {
         struct inode *inode = file_inode(file);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_lk_in inarg;
         int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
         struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
-       pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
+       pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
         int err;
  
         if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
                 return 0;
  
         fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
  
         /* locking is restartable */
         if (err == -EINTR)
@@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
  static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
  {
         struct inode *inode = mapping->host;
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_bmap_in inarg;
         struct fuse_bmap_out outarg;
         int err;
  
-       if (!inode->i_sb->s_bdev || fc->no_bmap)
+       if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
                 return 0;
  
         memset(&inarg, 0, sizeof(inarg));
@@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS)
-               fc->no_bmap = 1;
+               fm->fc->no_bmap = 1;
  
         return err ? 0 : outarg.block;
  }
@@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
  static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
  {
         struct inode *inode = file->f_mapping->host;
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_file *ff = file->private_data;
         FUSE_ARGS(args);
         struct fuse_lseek_in inarg = {
@@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
         struct fuse_lseek_out outarg;
         int err;
  
-       if (fc->no_lseek)
+       if (fm->fc->no_lseek)
                 goto fallback;
  
         args.opcode = FUSE_LSEEK;
@@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err) {
                 if (err == -ENOSYS) {
-                       fc->no_lseek = 1;
+                       fm->fc->no_lseek = 1;
                         goto fallback;
                 }
                 return err;
@@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                    unsigned int flags)
  {
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         struct fuse_ioctl_in inarg = {
                 .fh = ff->fh,
                 .cmd = cmd,
@@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
         BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
  
         err = -ENOMEM;
-       ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
+       ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
         iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
         if (!ap.pages || !iov_page)
                 goto out;
  
-       fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+       fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
  
         /*
          * If restricted, initialize IO parameters as encoded in @cmd.
@@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
  
         /* make sure there are enough buffer pages and init request with them */
         err = -ENOMEM;
-       if (max_pages > fc->max_pages)
+       if (max_pages > fm->fc->max_pages)
                 goto out;
         while (ap.num_pages < max_pages) {
                 ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
         ap.args.out_pages = true;
         ap.args.out_argvar = true;
  
-       transferred = fuse_simple_request(fc, &ap.args);
+       transferred = fuse_simple_request(fm, &ap.args);
         err = transferred;
         if (transferred < 0)
                 goto out;
@@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                         goto out;
  
                 vaddr = kmap_atomic(ap.pages[0]);
-               err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+               err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
                                             transferred, in_iovs + out_iovs,
                                             (flags & FUSE_IOCTL_COMPAT) != 0);
                 kunmap_atomic(vaddr);
@@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                 in_iov = iov_page;
                 out_iov = in_iov + in_iovs;
  
-               err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
+               err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
                 if (err)
                         goto out;
  
-               err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
+               err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
                 if (err)
                         goto out;
  
@@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
  __poll_t fuse_file_poll(struct file *file, poll_table *wait)
  {
         struct fuse_file *ff = file->private_data;
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
         struct fuse_poll_out outarg;
         FUSE_ARGS(args);
         int err;
  
-       if (fc->no_poll)
+       if (fm->fc->no_poll)
                 return DEFAULT_POLLMASK;
  
         poll_wait(file, &ff->poll_wait, wait);
@@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
          */
         if (waitqueue_active(&ff->poll_wait)) {
                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
-               fuse_register_polled_file(fc, ff);
+               fuse_register_polled_file(fm->fc, ff);
         }
  
         args.opcode = FUSE_POLL;
@@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
  
         if (!err)
                 return demangle_poll(outarg.revents);
         if (err == -ENOSYS) {
-               fc->no_poll = 1;
+               fm->fc->no_poll = 1;
                 return DEFAULT_POLLMASK;
         }
         return EPOLLERR;
@@ -3120,13 +3146,13 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
          * By default, we want to optimize all I/Os with async request
          * submission to the client filesystem if supported.
          */
-       io->async = ff->fc->async_dio;
+       io->async = ff->fm->fc->async_dio;
         io->iocb = iocb;
         io->blocking = is_sync_kiocb(iocb);
  
         /* optimization for short read */
         if (io->async && !io->write && offset + count > i_size) {
-               iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
+               iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
                 shortened = count - iov_iter_count(iter);
                 count -= shortened;
         }
@@ -3196,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
         struct fuse_file *ff = file->private_data;
         struct inode *inode = file_inode(file);
         struct fuse_inode *fi = get_fuse_inode(inode);
-       struct fuse_conn *fc = ff->fc;
+       struct fuse_mount *fm = ff->fm;
         FUSE_ARGS(args);
         struct fuse_fallocate_in inarg = {
                 .fh = ff->fh,
@@ -3208,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
         bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
                            (mode & FALLOC_FL_PUNCH_HOLE);
  
+       bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+
         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                 return -EOPNOTSUPP;
  
-       if (fc->no_fallocate)
+       if (fm->fc->no_fallocate)
                 return -EOPNOTSUPP;
  
         if (lock_inode) {
                 inode_lock(inode);
+               if (block_faults) {
+                       down_write(&fi->i_mmap_sem);
+                       err = fuse_dax_break_layouts(inode, 0, 0);
+                       if (err)
+                               goto out;
+               }
+
                 if (mode & FALLOC_FL_PUNCH_HOLE) {
                         loff_t endbyte = offset + length - 1;
  
@@ -3240,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
         args.in_numargs = 1;
         args.in_args[0].size = sizeof(inarg);
         args.in_args[0].value = &inarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
-               fc->no_fallocate = 1;
+               fm->fc->no_fallocate = 1;
                 err = -EOPNOTSUPP;
         }
         if (err)
@@ -3252,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
         if (!(mode & FALLOC_FL_KEEP_SIZE)) {
                 bool changed = fuse_write_update_size(inode, offset + length);
  
-               if (changed && fc->writeback_cache)
+               if (changed && fm->fc->writeback_cache)
                         file_update_time(file);
         }
  
@@ -3265,6 +3300,9 @@ out:
         if (!(mode & FALLOC_FL_KEEP_SIZE))
                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
  
+       if (block_faults)
+               up_write(&fi->i_mmap_sem);
+
         if (lock_inode)
                 inode_unlock(inode);
  
@@ -3280,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
         struct inode *inode_in = file_inode(file_in);
         struct inode *inode_out = file_inode(file_out);
         struct fuse_inode *fi_out = get_fuse_inode(inode_out);
-       struct fuse_conn *fc = ff_in->fc;
+       struct fuse_mount *fm = ff_in->fm;
+       struct fuse_conn *fc = fm->fc;
         FUSE_ARGS(args);
         struct fuse_copy_file_range_in inarg = {
                 .fh_in = ff_in->fh,
@@ -3349,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
                 fc->no_copy_file_range = 1;
                 err = -EOPNOTSUPP;
@@ -3404,6 +3443,7 @@ static const struct file_operations fuse_file_operations = {
         .release        = fuse_release,
         .fsync          = fuse_fsync,
         .lock           = fuse_file_lock,
+       .get_unmapped_area = thp_get_unmapped_area,
         .flock          = fuse_file_flock,
         .splice_read    = generic_file_splice_read,
         .splice_write   = iter_file_splice_write,
@@ -3439,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode)
         fi->writectr = 0;
         init_waitqueue_head(&fi->page_waitq);
         fi->writepages = RB_ROOT;
+
+       if (IS_ENABLED(CONFIG_FUSE_DAX))
+               fuse_dax_inode_init(inode);
  }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h

index 740a8a7..d515980 100644 (file)
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -148,6 +148,20 @@ struct fuse_inode {
  
         /** Lock to protect write related fields */
         spinlock_t lock;
+
+       /**
+        * Can't take inode lock in fault path (leads to circular dependency).
+        * Introduce another semaphore which can be taken in fault path and
+        * then other filesystem paths can take this to block faults.
+        */
+       struct rw_semaphore i_mmap_sem;
+
+#ifdef CONFIG_FUSE_DAX
+       /*
+        * Dax specific inode data
+        */
+       struct fuse_inode_dax *dax;
+#endif
  };
  
  /** FUSE inode state bits */
@@ -161,12 +175,13 @@ enum {
  };
  
  struct fuse_conn;
+struct fuse_mount;
  struct fuse_release_args;
  
  /** FUSE specific file data */
  struct fuse_file {
         /** Fuse connection for this file */
-       struct fuse_conn *fc;
+       struct fuse_mount *fm;
  
         /* Argument space reserved for release */
         struct fuse_release_args *release_args;
@@ -252,7 +267,7 @@ struct fuse_args {
         bool may_block:1;
         struct fuse_in_arg in_args[3];
         struct fuse_arg out_args[2];
-       void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+       void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
  };
  
  struct fuse_args_pages {
@@ -360,6 +375,9 @@ struct fuse_req {
         /** virtio-fs's physically contiguous buffer for in and out args */
         void *argbuf;
  #endif
+
+       /** fuse_mount this request belongs to */
+       struct fuse_mount *fm;
  };
  
  struct fuse_iqueue;
@@ -482,11 +500,15 @@ struct fuse_fs_context {
         bool destroy:1;
         bool no_control:1;
         bool no_force_umount:1;
-       bool no_mount_options:1;
+       bool legacy_opts_show:1;
+       bool dax:1;
         unsigned int max_read;
         unsigned int blksize;
         const char *subtype;
  
+       /* DAX device, may be NULL */
+       struct dax_device *dax_dev;
+
         /* fuse_dev pointer to fill in, should contain NULL on entry */
         void **fudptr;
  };
@@ -494,9 +516,9 @@ struct fuse_fs_context {
  /**
   * A Fuse connection.
   *
- * This structure is created, when the filesystem is mounted, and is
- * destroyed, when the client device is closed and the filesystem is
- * unmounted.
+ * This structure is created, when the root filesystem is mounted, and
+ * is destroyed, when the client device is closed and the last
+ * fuse_mount is destroyed.
   */
  struct fuse_conn {
         /** Lock protecting accessess to  members of this structure */
@@ -610,6 +632,9 @@ struct fuse_conn {
         /** cache READLINK responses in page cache */
         unsigned cache_symlinks:1;
  
+       /* show legacy mount options */
+       unsigned int legacy_opts_show:1;
+
         /*
          * The following bitfields are only for optimization purposes
          * and hence races in setting them will not cause malfunction
@@ -717,8 +742,8 @@ struct fuse_conn {
         /** Do not allow MNT_FORCE umount */
         unsigned int no_force_umount:1;
  
-       /* Do not show mount options */
-       unsigned int no_mount_options:1;
+       /* Auto-mount submounts announced by the server */
+       unsigned int auto_submounts:1;
  
         /** The number of requests waiting for completion */
         atomic_t num_waiting;
@@ -726,10 +751,10 @@ struct fuse_conn {
         /** Negotiated minor version */
         unsigned minor;
  
-       /** Entry on the fuse_conn_list */
+       /** Entry on the fuse_mount_list */
         struct list_head entry;
  
-       /** Device ID from super block */
+       /** Device ID from the root super block */
         dev_t dev;
  
         /** Dentries in the control filesystem */
@@ -747,24 +772,70 @@ struct fuse_conn {
         /** Called on final put */
         void (*release)(struct fuse_conn *);
  
-       /** Super block for this connection. */
-       struct super_block *sb;
-
-       /** Read/write semaphore to hold when accessing sb. */
+       /**
+        * Read/write semaphore to hold when accessing the sb of any
+        * fuse_mount belonging to this connection
+        */
         struct rw_semaphore killsb;
  
         /** List of device instances belonging to this connection */
         struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+       /* Dax specific conn data, non-NULL if DAX is enabled */
+       struct fuse_conn_dax *dax;
+#endif
+
+       /** List of filesystems using this connection */
+       struct list_head mounts;
  };
  
-static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object allows sharing a fuse_conn between separate mounts to
+ * allow submounts with dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+       /* Underlying (potentially shared) connection to the FUSE server */
+       struct fuse_conn *fc;
+
+       /* Refcount */
+       refcount_t count;
+
+       /*
+        * Super block for this mount (fc->killsb must be held when
+        * accessing this).
+        */
+       struct super_block *sb;
+
+       /* Entry on fc->mounts */
+       struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
  {
         return sb->s_fs_info;
  }
  
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+
+       return fm ? fm->fc : NULL;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+       return get_fuse_mount_super(inode->i_sb);
+}
+
  static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
  {
-       return get_fuse_conn_super(inode->i_sb);
+       struct fuse_mount *fm = get_fuse_mount(inode);
+
+       return fm ? fm->fc : NULL;
  }
  
  static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
@@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations;
  extern const struct dentry_operations fuse_root_dentry_operations;
  
  /**
- * Inode to nodeid comparison.
- */
-int fuse_inode_eq(struct inode *inode, void *_nodeidp);
-
-/**
   * Get a filled in inode
   */
  struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
   */
  int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
  
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
  void fuse_file_free(struct fuse_file *ff);
  void fuse_finish_open(struct inode *inode, struct file *file);
  
@@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void);
  /**
   * Simple request sending that does request allocation and freeing
   */
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
                            gfp_t gfp_flags);
  
  /**
   * End a finished request
   */
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_req *req);
  
  /* Abort all requests */
  void fuse_abort_conn(struct fuse_conn *fc);
@@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
  /**
   * Initialize fuse_conn
   */
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+                   struct user_namespace *user_ns,
                     const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
  
  /**
@@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
   */
  void fuse_conn_put(struct fuse_conn *fc);
  
+/**
+ * Acquire reference to fuse_mount
+ */
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm);
+
+/**
+ * Release reference to fuse_mount
+ */
+void fuse_mount_put(struct fuse_mount *fm);
+
  struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
  struct fuse_dev *fuse_dev_alloc(void);
  void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
  void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_conn *fc);
+void fuse_send_init(struct fuse_mount *fm);
  
  /**
   * Fill in superblock and initialize fuse connection
@@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc);
   */
  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
  
-/**
- * Disassociate fuse connection from superblock and kill the superblock
+/*
+ * Fill in superblock for submounts
+ * @sb: partially-initialized superblock to fill in
+ * @parent_fi: The fuse_inode of the parent filesystem where this submount is
+ *            mounted
+ */
+int fuse_fill_super_submount(struct super_block *sb,
+                            struct fuse_inode *parent_fi);
+
+/*
+ * Remove the mount from the connection
   *
- * Calls kill_anon_super(), do not use with bdev mounts.
+ * Returns whether this was the last mount
   */
-void fuse_kill_sb_anon(struct super_block *sb);
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
+ */
+void fuse_conn_destroy(struct fuse_mount *fm);
  
  /**
   * Add connection to control filesystem
@@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode);
  void fuse_release_nowrite(struct inode *inode);
  
  /**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result.  Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+                          struct fuse_mount **fm);
+
+/**
   * File-system tells the kernel to invalidate cache for the given node id.
   */
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
                              loff_t offset, loff_t len);
  
  /**
@@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
   *    - is a file or oan empty directory
   * then the dentry is unhashed (d_delete()).
   */
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
                              u64 child_nodeid, struct qstr *name);
  
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
                  bool isdir);
  
  /**
@@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
  u64 fuse_get_unique(struct fuse_iqueue *fiq);
  void fuse_free_conn(struct fuse_conn *fc);
  
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_cleanup(struct inode *inode);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
  #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c

index 5813292..1a47afc 100644 (file)
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
         fi->orig_ino = 0;
         fi->state = 0;
         mutex_init(&fi->mutex);
+       init_rwsem(&fi->i_mmap_sem);
         spin_lock_init(&fi->lock);
         fi->forget = fuse_alloc_forget();
-       if (!fi->forget) {
-               kmem_cache_free(fuse_inode_cachep, fi);
-               return NULL;
-       }
+       if (!fi->forget)
+               goto out_free;
+
+       if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+               goto out_free_forget;
  
         return &fi->inode;
+
+out_free_forget:
+       kfree(fi->forget);
+out_free:
+       kmem_cache_free(fuse_inode_cachep, fi);
+       return NULL;
  }
  
  static void fuse_free_inode(struct inode *inode)
@@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode)
  
         mutex_destroy(&fi->mutex);
         kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+       kfree(fi->dax);
+#endif
         kmem_cache_free(fuse_inode_cachep, fi);
  }
  
@@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode)
         clear_inode(inode);
         if (inode->i_sb->s_flags & SB_ACTIVE) {
                 struct fuse_conn *fc = get_fuse_conn(inode);
-               fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
-               fi->forget = NULL;
+
+               if (FUSE_IS_DAX(inode))
+                       fuse_dax_inode_cleanup(inode);
+               if (fi->nlookup) {
+                       fuse_queue_forget(fc, fi->forget, fi->nodeid,
+                                         fi->nlookup);
+                       fi->forget = NULL;
+               }
         }
         if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
                 WARN_ON(!list_empty(&fi->write_files));
@@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
                 BUG();
  }
  
-int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
  {
         u64 nodeid = *(u64 *) _nodeidp;
         if (get_node_id(inode) == nodeid)
@@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
         struct fuse_inode *fi;
         struct fuse_conn *fc = get_fuse_conn_super(sb);
  
- retry:
+       /*
+        * Auto mount points get their node id from the submount root, which is
+        * not a unique identifier within this filesystem.
+        *
+        * To avoid conflicts, do not place submount points into the inode hash
+        * table.
+        */
+       if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+           S_ISDIR(attr->mode)) {
+               inode = new_inode(sb);
+               if (!inode)
+                       return NULL;
+
+               fuse_init_inode(inode, attr);
+               get_fuse_inode(inode)->nodeid = nodeid;
+               inode->i_flags |= S_AUTOMOUNT;
+               goto done;
+       }
+
+retry:
         inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
         if (!inode)
                 return NULL;
@@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
                 iput(inode);
                 goto retry;
         }
-
+done:
         fi = get_fuse_inode(inode);
         spin_lock(&fi->lock);
         fi->nlookup++;
@@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
         return inode;
  }
  
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+                          struct fuse_mount **fm)
+{
+       struct fuse_mount *fm_iter;
+       struct inode *inode;
+
+       WARN_ON(!rwsem_is_locked(&fc->killsb));
+       list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+               if (!fm_iter->sb)
+                       continue;
+
+               inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+               if (inode) {
+                       if (fm)
+                               *fm = fm_iter;
+                       return inode;
+               }
+       }
+
+       return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
                              loff_t offset, loff_t len)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
         struct fuse_inode *fi;
         struct inode *inode;
         pgoff_t pg_start;
         pgoff_t pg_end;
  
-       inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+       inode = fuse_ilookup(fc, nodeid, NULL);
         if (!inode)
                 return -ENOENT;
  
@@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb)
                 fuse_abort_conn(fc);
  }
  
-static void fuse_send_destroy(struct fuse_conn *fc)
+static void fuse_send_destroy(struct fuse_mount *fm)
  {
-       if (fc->conn_init) {
+       if (fm->fc->conn_init) {
                 FUSE_ARGS(args);
  
                 args.opcode = FUSE_DESTROY;
                 args.force = true;
                 args.nocreds = true;
-               fuse_simple_request(fc, &args);
+               fuse_simple_request(fm, &args);
         }
  }
  
  static void fuse_put_super(struct super_block *sb)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
  
-       mutex_lock(&fuse_mutex);
-       list_del(&fc->entry);
-       fuse_ctl_remove_conn(fc);
-       mutex_unlock(&fuse_mutex);
-
-       fuse_conn_put(fc);
+       fuse_mount_put(fm);
  }
  
  static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
  static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
         struct super_block *sb = dentry->d_sb;
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
         FUSE_ARGS(args);
         struct fuse_statfs_out outarg;
         int err;
  
-       if (!fuse_allow_current_process(fc)) {
+       if (!fuse_allow_current_process(fm->fc)) {
                 buf->f_type = FUSE_SUPER_MAGIC;
                 return 0;
         }
@@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
         args.out_numargs = 1;
         args.out_args[0].size = sizeof(outarg);
         args.out_args[0].value = &outarg;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (!err)
                 convert_fuse_statfs(buf, &outarg.st);
         return err;
@@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
         struct super_block *sb = root->d_sb;
         struct fuse_conn *fc = get_fuse_conn_super(sb);
  
-       if (fc->no_mount_options)
-               return 0;
+       if (fc->legacy_opts_show) {
+               seq_printf(m, ",user_id=%u",
+                          from_kuid_munged(fc->user_ns, fc->user_id));
+               seq_printf(m, ",group_id=%u",
+                          from_kgid_munged(fc->user_ns, fc->group_id));
+               if (fc->default_permissions)
+                       seq_puts(m, ",default_permissions");
+               if (fc->allow_other)
+                       seq_puts(m, ",allow_other");
+               if (fc->max_read != ~0)
+                       seq_printf(m, ",max_read=%u", fc->max_read);
+               if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+                       seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+       }
+#ifdef CONFIG_FUSE_DAX
+       if (fc->dax)
+               seq_puts(m, ",dax");
+#endif
  
-       seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
-       seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
-       if (fc->default_permissions)
-               seq_puts(m, ",default_permissions");
-       if (fc->allow_other)
-               seq_puts(m, ",allow_other");
-       if (fc->max_read != ~0)
-               seq_printf(m, ",max_read=%u", fc->max_read);
-       if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
-               seq_printf(m, ",blksize=%lu", sb->s_blocksize);
         return 0;
  }
  
@@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
         fpq->connected = 1;
  }
  
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+                   struct user_namespace *user_ns,
                     const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
  {
         memset(fc, 0, sizeof(*fc));
@@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
         fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
         fc->user_ns = get_user_ns(user_ns);
         fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+
+       INIT_LIST_HEAD(&fc->mounts);
+       list_add(&fm->fc_entry, &fc->mounts);
+       fm->fc = fc;
+       refcount_set(&fm->count, 1);
  }
  EXPORT_SYMBOL_GPL(fuse_conn_init);
  
@@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc)
         if (refcount_dec_and_test(&fc->count)) {
                 struct fuse_iqueue *fiq = &fc->iq;
  
+               if (IS_ENABLED(CONFIG_FUSE_DAX))
+                       fuse_dax_conn_free(fc);
                 if (fiq->ops->release)
                         fiq->ops->release(fiq);
                 put_pid_ns(fc->pid_ns);
@@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
  }
  EXPORT_SYMBOL_GPL(fuse_conn_get);
  
+/*
+ * Drop one reference on @fm.  When the last reference goes away, put the
+ * connection the mount points at (if any) and free the mount structure.
+ */
+void fuse_mount_put(struct fuse_mount *fm)
+{
+       /* Somebody else still holds a reference: nothing to tear down. */
+       if (!refcount_dec_and_test(&fm->count))
+               return;
+
+       if (fm->fc)
+               fuse_conn_put(fm->fc);
+       kfree(fm);
+}
+EXPORT_SYMBOL_GPL(fuse_mount_put);
+
+/* Take an additional reference on @fm and return the same pointer. */
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm)
+{
+       refcount_inc(&fm->count);
+       return fm;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_get);
+
  static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
  {
         struct fuse_attr attr;
@@ -895,14 +978,16 @@ struct fuse_init_args {
         struct fuse_init_out out;
  };
  
-static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
                                int error)
  {
+       struct fuse_conn *fc = fm->fc;
         struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
         struct fuse_init_out *arg = &ia->out;
+       bool ok = true;
  
         if (error || arg->major != FUSE_KERNEL_VERSION)
-               fc->conn_error = 1;
+               ok = false;
         else {
                 unsigned long ra_pages;
  
@@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
                         if (arg->flags & FUSE_HANDLE_KILLPRIV)
                                 fc->handle_killpriv = 1;
                         if (arg->time_gran && arg->time_gran <= 1000000000)
-                               fc->sb->s_time_gran = arg->time_gran;
+                               fm->sb->s_time_gran = arg->time_gran;
                         if ((arg->flags & FUSE_POSIX_ACL)) {
                                 fc->default_permissions = 1;
                                 fc->posix_acl = 1;
-                               fc->sb->s_xattr = fuse_acl_xattr_handlers;
+                               fm->sb->s_xattr = fuse_acl_xattr_handlers;
                         }
                         if (arg->flags & FUSE_CACHE_SYMLINKS)
                                 fc->cache_symlinks = 1;
@@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
                                         min_t(unsigned int, FUSE_MAX_MAX_PAGES,
                                         max_t(unsigned int, arg->max_pages, 1));
                         }
+                       if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+                           arg->flags & FUSE_MAP_ALIGNMENT &&
+                           !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+                               ok = false;
+                       }
                 } else {
                         ra_pages = fc->max_read / PAGE_SIZE;
                         fc->no_lock = 1;
                         fc->no_flock = 1;
                 }
  
-               fc->sb->s_bdi->ra_pages =
-                               min(fc->sb->s_bdi->ra_pages, ra_pages);
+               fm->sb->s_bdi->ra_pages =
+                               min(fm->sb->s_bdi->ra_pages, ra_pages);
                 fc->minor = arg->minor;
                 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
                 fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
         }
         kfree(ia);
  
+       if (!ok) {
+               fc->conn_init = 0;
+               fc->conn_error = 1;
+       }
+
         fuse_set_initialized(fc);
         wake_up_all(&fc->blocked_waitq);
  }
  
-void fuse_send_init(struct fuse_conn *fc)
+void fuse_send_init(struct fuse_mount *fm)
  {
         struct fuse_init_args *ia;
  
@@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc)
  
         ia->in.major = FUSE_KERNEL_VERSION;
         ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
-       ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+       ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
         ia->in.flags |=
                 FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
                 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
@@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc)
                 FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
                 FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
                 FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+       if (fm->fc->dax)
+               ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
+       if (fm->fc->auto_submounts)
+               ia->in.flags |= FUSE_SUBMOUNTS;
+
         ia->args.opcode = FUSE_INIT;
         ia->args.in_numargs = 1;
         ia->args.in_args[0].size = sizeof(ia->in);
@@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc)
         ia->args.nocreds = true;
         ia->args.end = process_init_reply;
  
-       if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
-               process_init_reply(fc, &ia->args, -ENOTCONN);
+       if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+               process_init_reply(fm, &ia->args, -ENOTCONN);
  }
  EXPORT_SYMBOL_GPL(fuse_send_init);
  
@@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud)
  }
  EXPORT_SYMBOL_GPL(fuse_dev_free);
  
+/*
+ * Populate @attr from the VFS inode embedded in @fi.  Members of @attr
+ * that are not named in the initializer are zeroed by the compound
+ * literal assignment.
+ */
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+                                     const struct fuse_inode *fi)
+{
+       const struct inode *inode = &fi->inode;
+
+       *attr = (struct fuse_attr){
+               .ino            = inode->i_ino,
+               .size           = inode->i_size,
+               .blocks         = inode->i_blocks,
+               .atime          = inode->i_atime.tv_sec,
+               .mtime          = inode->i_mtime.tv_sec,
+               .ctime          = inode->i_ctime.tv_sec,
+               .atimensec      = inode->i_atime.tv_nsec,
+               .mtimensec      = inode->i_mtime.tv_nsec,
+               .ctimensec      = inode->i_ctime.tv_nsec,
+               .mode           = inode->i_mode,
+               .nlink          = inode->i_nlink,
+               .uid            = inode->i_uid.val,
+               .gid            = inode->i_gid.val,
+               .rdev           = inode->i_rdev,
+               /* i_blkbits is a shift count; convert it to a byte size */
+               .blksize        = 1u << inode->i_blkbits,
+       };
+}
+
+/*
+ * Apply the settings common to fuse superblocks: magic number, operation
+ * tables, size and time limits, and the superblock flag adjustments.
+ */
+static void fuse_sb_defaults(struct super_block *sb)
+{
+       const bool in_init_ns = sb->s_user_ns == &init_user_ns;
+
+       sb->s_magic = FUSE_SUPER_MAGIC;
+       sb->s_op = &fuse_super_operations;
+       sb->s_export_op = &fuse_export_operations;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sb->s_time_gran = 1;
+       sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+       sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+
+       if (in_init_ns) {
+               sb->s_xattr = fuse_xattr_handlers;
+               return;
+       }
+
+       /*
+        * If we are not in the initial user namespace the mounter is
+        * flagged as untrusted and posix acls must be translated, so
+        * fuse_no_acl_xattr_handlers is installed instead.
+        */
+       sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+       sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+/*
+ * Fill in the superblock @sb of a submount rooted at the inode @parent_fi,
+ * which lives on the parent superblock.
+ *
+ * The backing device info, xattr handlers, time granularity, block size
+ * and subtype are taken over from the parent superblock, and a duplicate
+ * of the parent inode becomes the root of @sb.
+ *
+ * Returns 0 on success, or -ENOMEM if the subtype string, the root inode
+ * or the root dentry cannot be allocated.
+ */
+int fuse_fill_super_submount(struct super_block *sb,
+                            struct fuse_inode *parent_fi)
+{
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       struct super_block *parent_sb = parent_fi->inode.i_sb;
+       struct fuse_attr root_attr;
+       struct inode *root;
+
+       fuse_sb_defaults(sb);
+       fm->sb = sb;
+
+       /* Share the parent's bdi instead of the placeholder noop bdi. */
+       WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+       sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+       sb->s_xattr = parent_sb->s_xattr;
+       sb->s_time_gran = parent_sb->s_time_gran;
+       sb->s_blocksize = parent_sb->s_blocksize;
+       sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+       sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+       /* A NULL copy is only an error if the parent had a subtype. */
+       if (parent_sb->s_subtype && !sb->s_subtype)
+               return -ENOMEM;
+
+       fuse_fill_attr_from_inode(&root_attr, parent_fi);
+       root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+       /*
+        * fuse_iget() returns NULL when no inode could be obtained; bail
+        * out before touching the inode below.
+        */
+       if (!root)
+               return -ENOMEM;
+
+       /*
+        * This inode is just a duplicate, so it is not looked up and
+        * its nlookup should not be incremented.  fuse_iget() does
+        * that, though, so undo it here.
+        */
+       get_fuse_inode(root)->nlookup--;
+       sb->s_d_op = &fuse_dentry_operations;
+       sb->s_root = d_make_root(root);
+       if (!sb->s_root)
+               return -ENOMEM;
+
+       return 0;
+}
+
  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
  {
         struct fuse_dev *fud = NULL;
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       struct fuse_conn *fc = fm->fc;
         struct inode *root;
         struct dentry *root_dentry;
         int err;
@@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
         if (sb->s_flags & SB_MANDLOCK)
                 goto err;
  
-       sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+       fuse_sb_defaults(sb);
  
         if (ctx->is_bdev) {
  #ifdef CONFIG_BLOCK
@@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
  
         sb->s_subtype = ctx->subtype;
         ctx->subtype = NULL;
-       sb->s_magic = FUSE_SUPER_MAGIC;
-       sb->s_op = &fuse_super_operations;
-       sb->s_xattr = fuse_xattr_handlers;
-       sb->s_maxbytes = MAX_LFS_FILESIZE;
-       sb->s_time_gran = 1;
-       sb->s_export_op = &fuse_export_operations;
-       sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
-       if (sb->s_user_ns != &init_user_ns)
-               sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
-
-       /*
-        * If we are not in the initial user namespace posix
-        * acls must be translated.
-        */
-       if (sb->s_user_ns != &init_user_ns)
-               sb->s_xattr = fuse_no_acl_xattr_handlers;
+       if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+               err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
+               if (err)
+                       goto err;
+       }
  
         if (ctx->fudptr) {
                 err = -ENOMEM;
                 fud = fuse_dev_alloc_install(fc);
                 if (!fud)
-                       goto err;
+                       goto err_free_dax;
         }
  
         fc->dev = sb->s_dev;
-       fc->sb = sb;
+       fm->sb = sb;
         err = fuse_bdi_init(fc, sb);
         if (err)
                 goto err_dev_free;
@@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
         fc->allow_other = ctx->allow_other;
         fc->user_id = ctx->user_id;
         fc->group_id = ctx->group_id;
-       fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+       fc->legacy_opts_show = ctx->legacy_opts_show;
+       fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
         fc->destroy = ctx->destroy;
         fc->no_control = ctx->no_control;
         fc->no_force_umount = ctx->no_force_umount;
-       fc->no_mount_options = ctx->no_mount_options;
  
         err = -ENOMEM;
         root = fuse_get_root_inode(sb, ctx->rootmode);
@@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
   err_dev_free:
         if (fud)
                 fuse_dev_free(fud);
+ err_free_dax:
+       if (IS_ENABLED(CONFIG_FUSE_DAX))
+               fuse_dax_conn_free(fc);
   err:
         return err;
  }
@@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
         struct file *file;
         int err;
         struct fuse_conn *fc;
+       struct fuse_mount *fm;
  
         err = -EINVAL;
         file = fget(ctx->fd);
@@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
         if (!fc)
                 goto err_fput;
  
-       fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+       fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+       if (!fm) {
+               kfree(fc);
+               goto err_fput;
+       }
+
+       fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
         fc->release = fuse_free_conn;
-       sb->s_fs_info = fc;
+
+       sb->s_fs_info = fm;
  
         err = fuse_fill_super_common(sb, ctx);
         if (err)
@@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
          * CPUs after this
          */
         fput(file);
-       fuse_send_init(get_fuse_conn_super(sb));
+       fuse_send_init(get_fuse_mount_super(sb));
         return 0;
  
   err_put_conn:
-       fuse_conn_put(fc);
+       fuse_mount_put(fm);
         sb->s_fs_info = NULL;
   err_fput:
         fput(file);
@@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc)
  
         ctx->max_read = ~0;
         ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+       ctx->legacy_opts_show = true;
  
  #ifdef CONFIG_BLOCK
         if (fc->fs_type == &fuseblk_fs_type) {
@@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc)
         return 0;
  }
  
-static void fuse_sb_destroy(struct super_block *sb)
+bool fuse_mount_remove(struct fuse_mount *fm)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       struct fuse_conn *fc = fm->fc;
+       bool last = false;
  
-       if (fc) {
-               if (fc->destroy)
-                       fuse_send_destroy(fc);
+       down_write(&fc->killsb);
+       list_del_init(&fm->fc_entry);
+       if (list_empty(&fc->mounts))
+               last = true;
+       up_write(&fc->killsb);
  
-               fuse_abort_conn(fc);
-               fuse_wait_aborted(fc);
+       return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
  
-               down_write(&fc->killsb);
-               fc->sb = NULL;
-               up_write(&fc->killsb);
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+       struct fuse_conn *fc = fm->fc;
+
+       if (fc->destroy)
+               fuse_send_destroy(fm);
+
+       fuse_abort_conn(fc);
+       fuse_wait_aborted(fc);
+
+       if (!list_empty(&fc->entry)) {
+               mutex_lock(&fuse_mutex);
+               list_del(&fc->entry);
+               fuse_ctl_remove_conn(fc);
+               mutex_unlock(&fuse_mutex);
         }
  }
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
  
-void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_kill_sb_anon(struct super_block *sb)
  {
-       fuse_sb_destroy(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       bool last;
+
+       if (fm) {
+               last = fuse_mount_remove(fm);
+               if (last)
+                       fuse_conn_destroy(fm);
+       }
         kill_anon_super(sb);
  }
-EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
  
  static struct file_system_type fuse_fs_type = {
         .owner          = THIS_MODULE,
@@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse");
  #ifdef CONFIG_BLOCK
  static void fuse_kill_sb_blk(struct super_block *sb)
  {
-       fuse_sb_destroy(sb);
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       bool last;
+
+       if (fm) {
+               last = fuse_mount_remove(fm);
+               if (last)
+                       fuse_conn_destroy(fm);
+       }
         kill_block_super(sb);
  }
  
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c

index 90e3f01..3b5e910 100644 (file)
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -252,7 +252,7 @@ retry:
  static void fuse_force_forget(struct file *file, u64 nodeid)
  {
         struct inode *inode = file_inode(file);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_forget_in inarg;
         FUSE_ARGS(args);
  
@@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid)
         args.force = true;
         args.noreply = true;
  
-       fuse_simple_request(fc, &args);
+       fuse_simple_request(fm, &args);
         /* ignore errors */
  }
  
@@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
         ssize_t res;
         struct page *page;
         struct inode *inode = file_inode(file);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         struct fuse_io_args ia = {};
         struct fuse_args_pages *ap = &ia.ap;
         struct fuse_page_desc desc = { .length = PAGE_SIZE };
@@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
         ap->pages = &page;
         ap->descs = &desc;
         if (plus) {
-               attr_version = fuse_get_attr_version(fc);
+               attr_version = fuse_get_attr_version(fm->fc);
                 fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
                                     FUSE_READDIRPLUS);
         } else {
@@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
                                     FUSE_READDIR);
         }
         locked = fuse_lock_inode(inode);
-       res = fuse_simple_request(fc, &ap->args);
+       res = fuse_simple_request(fm, &ap->args);
         fuse_unlock_inode(inode, locked);
         if (res >= 0) {
                 if (!res) {
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c

index 104f35d..21a9e53 100644 (file)
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,12 +5,17 @@
   */
  
  #include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
  #include <linux/module.h>
  #include <linux/virtio.h>
  #include <linux/virtio_fs.h>
  #include <linux/delay.h>
  #include <linux/fs_context.h>
+#include <linux/fs_parser.h>
  #include <linux/highmem.h>
+#include <linux/uio.h>
  #include "fuse_i.h"
  
  /* List of virtio-fs device instances and a lock for the list. Also provides
@@ -24,6 +29,8 @@ enum {
         VQ_REQUEST
  };
  
+#define VQ_NAME_LEN    24
+
  /* Per-virtqueue state */
  struct virtio_fs_vq {
         spinlock_t lock;
@@ -36,7 +43,7 @@ struct virtio_fs_vq {
         bool connected;
         long in_flight;
         struct completion in_flight_zero; /* No inflight requests */
-       char name[24];
+       char name[VQ_NAME_LEN];
  } ____cacheline_aligned_in_smp;
  
  /* A virtio-fs device instance */
@@ -47,6 +54,12 @@ struct virtio_fs {
         struct virtio_fs_vq *vqs;
         unsigned int nvqs;               /* number of virtqueues */
         unsigned int num_request_queues; /* number of request queues */
+       struct dax_device *dax_dev;
+
+       /* DAX memory window where file contents are mapped */
+       void *window_kaddr;
+       phys_addr_t window_phys_addr;
+       size_t window_len;
  };
  
  struct virtio_fs_forget_req {
@@ -69,6 +82,44 @@ struct virtio_fs_req_work {
  static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
                                  struct fuse_req *req, bool in_flight);
  
+/* Identifiers for the mount options handled by virtio_fs_parse_param(). */
+enum {
+       OPT_DAX,
+};
+
+/* Option table passed to fs_parse(); "dax" is declared as a flag option. */
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+       fsparam_flag("dax", OPT_DAX),
+       {}
+};
+
+/* Parse one mount parameter into the fuse_fs_context kept in @fc.
+ *
+ * Returns 0 when the "dax" flag was recognised, the negative value from
+ * fs_parse() when parsing fails, or -EINVAL for any other option id.
+ */
+static int virtio_fs_parse_param(struct fs_context *fc,
+                                struct fs_parameter *param)
+{
+       struct fuse_fs_context *ctx = fc->fs_private;
+       struct fs_parse_result result;
+       int opt = fs_parse(fc, virtio_fs_parameters, param, &result);
+
+       if (opt < 0)
+               return opt;
+
+       /* OPT_DAX is the only option in the table. */
+       if (opt != OPT_DAX)
+               return -EINVAL;
+
+       ctx->dax = 1;
+       return 0;
+}
+
+/* Free the private context hanging off @fc->fs_private. */
+static void virtio_fs_free_fc(struct fs_context *fc)
+{
+       kfree(fc->fs_private);
+}
+
  static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
  {
         struct virtio_fs *fs = vq->vdev->priv;
@@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
         struct fuse_req *req;
         struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
                                                  dispatch_work.work);
-       struct fuse_conn *fc = fsvq->fud->fc;
         int ret;
  
         pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
  
                 list_del_init(&req->list);
                 spin_unlock(&fsvq->lock);
-               fuse_request_end(fc, req);
+               fuse_request_end(req);
         }
  
         /* Dispatch pending requests */
@@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
                         spin_unlock(&fsvq->lock);
                         pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
                                ret);
-                       fuse_request_end(fc, req);
+                       fuse_request_end(req);
                 }
         }
  }
@@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req,
                                        struct virtio_fs_vq *fsvq)
  {
         struct fuse_pqueue *fpq = &fsvq->fud->pq;
-       struct fuse_conn *fc = fsvq->fud->fc;
         struct fuse_args *args;
         struct fuse_args_pages *ap;
         unsigned int len, i, thislen;
@@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
         clear_bit(FR_SENT, &req->flags);
         spin_unlock(&fpq->lock);
  
-       fuse_request_end(fc, req);
+       fuse_request_end(req);
         spin_lock(&fsvq->lock);
         dec_in_flight_req(fsvq);
         spin_unlock(&fsvq->lock);
@@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
         schedule_work(&fsvq->done_work);
  }
  
+/* Initialize the per-virtqueue state in @fsvq.
+ *
+ * @name is copied into the fixed-size fsvq->name buffer.  @vq_type picks
+ * the work handlers: VQ_REQUEST installs the request handlers, any other
+ * value installs the hiprio handlers.
+ */
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+                             int vq_type)
+{
+       /* strscpy() always NUL-terminates the destination, unlike
+        * strncpy(), which leaves it unterminated when the source does
+        * not fit.
+        */
+       strscpy(fsvq->name, name, VQ_NAME_LEN);
+       spin_lock_init(&fsvq->lock);
+       INIT_LIST_HEAD(&fsvq->queued_reqs);
+       INIT_LIST_HEAD(&fsvq->end_reqs);
+       init_completion(&fsvq->in_flight_zero);
+
+       if (vq_type == VQ_REQUEST) {
+               INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+               INIT_DELAYED_WORK(&fsvq->dispatch_work,
+                                 virtio_fs_request_dispatch_work);
+       } else {
+               INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+               INIT_DELAYED_WORK(&fsvq->dispatch_work,
+                                 virtio_fs_hiprio_dispatch_work);
+       }
+}
+
  /* Initialize virtqueues */
  static int virtio_fs_setup_vqs(struct virtio_device *vdev,
                                struct virtio_fs *fs)
@@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
         if (fs->num_request_queues == 0)
                 return -EINVAL;
  
-       fs->nvqs = 1 + fs->num_request_queues;
+       fs->nvqs = VQ_REQUEST + fs->num_request_queues;
         fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
         if (!fs->vqs)
                 return -ENOMEM;
@@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
                 goto out;
         }
  
+       /* Initialize the hiprio/forget request virtqueue */
         callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
-       snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
-                       "hiprio");
+       virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
         names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
-       INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
-       INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
-       INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
-       INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
-                       virtio_fs_hiprio_dispatch_work);
-       init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
-       spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
  
         /* Initialize the requests virtqueues */
         for (i = VQ_REQUEST; i < fs->nvqs; i++) {
-               spin_lock_init(&fs->vqs[i].lock);
-               INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
-               INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
-                                 virtio_fs_request_dispatch_work);
-               INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
-               INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
-               init_completion(&fs->vqs[i].in_flight_zero);
-               snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
-                        "requests.%u", i - VQ_REQUEST);
+               char vq_name[VQ_NAME_LEN];
+
+               snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+               virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
                 callbacks[i] = virtio_fs_vq_done;
                 names[i] = fs->vqs[i].name;
         }
@@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
         vdev->config->del_vqs(vdev);
  }
  
+/* Translate a page offset inside the DAX window into a kernel address
+ * and/or a page frame number.  The window offset will have been produced
+ * by .iomap_begin(), which maps a file offset to a window offset.
+ *
+ * @kaddr and @pfn are optional outputs.  Returns @nr_pages, capped at the
+ * number of pages between @pgoff and the end of the window.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+                                   long nr_pages, void **kaddr, pfn_t *pfn)
+{
+       struct virtio_fs *fs = dax_get_private(dax_dev);
+       phys_addr_t win_off = PFN_PHYS(pgoff);
+       size_t pages_left = fs->window_len/PAGE_SIZE - pgoff;
+
+       if (kaddr)
+               *kaddr = fs->window_kaddr + win_off;
+       if (pfn)
+               *pfn = phys_to_pfn_t(fs->window_phys_addr + win_off,
+                                       PFN_DEV | PFN_MAP);
+
+       if (nr_pages > pages_left)
+               return pages_left;
+       return nr_pages;
+}
+
+/* DAX copy_from_iter hook: @dax_dev and @pgoff are unused, the copy is
+ * forwarded unchanged to copy_from_iter().
+ */
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+                                      pgoff_t pgoff, void *addr,
+                                      size_t bytes, struct iov_iter *i)
+{
+       return copy_from_iter(addr, bytes, i);
+}
+
+/* DAX copy_to_iter hook: @dax_dev and @pgoff are unused, the copy is
+ * forwarded unchanged to copy_to_iter().
+ */
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+                                      pgoff_t pgoff, void *addr,
+                                      size_t bytes, struct iov_iter *i)
+{
+       return copy_to_iter(addr, bytes, i);
+}
+
+/* Zero @nr_pages pages of the DAX device starting at page offset @pgoff
+ * and flush the zeroed range.
+ *
+ * Returns 0 on success or the negative value reported by
+ * dax_direct_access().
+ */
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+                                    pgoff_t pgoff, size_t nr_pages)
+{
+       const size_t len = nr_pages << PAGE_SHIFT;
+       void *window;
+       long mapped;
+
+       mapped = dax_direct_access(dax_dev, pgoff, nr_pages, &window, NULL);
+       if (mapped < 0)
+               return mapped;
+
+       memset(window, 0, len);
+       dax_flush(dax_dev, window, len);
+       return 0;
+}
+
+/* DAX operations for the virtio-fs window; passed to alloc_dax(). */
+static const struct dax_operations virtio_fs_dax_ops = {
+       .direct_access = virtio_fs_direct_access,
+       .copy_from_iter = virtio_fs_copy_from_iter,
+       .copy_to_iter = virtio_fs_copy_to_iter,
+       .zero_page_range = virtio_fs_zero_page_range,
+};
+
+/* devm action: kill the dax device passed in @data, then drop it with
+ * put_dax().
+ */
+static void virtio_fs_cleanup_dax(void *data)
+{
+       kill_dax(data);
+       put_dax(data);
+}
+
+/* Set up the DAX window of @fs from the device's shared memory cache
+ * region, if the device exposes one.
+ *
+ * Returns 0 without doing anything when CONFIG_FUSE_DAX is disabled or
+ * the device has no cache region; otherwise 0 on success or a negative
+ * errno.
+ */
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+       struct device *dev = &vdev->dev;
+       struct virtio_shm_region cache_reg;
+       struct dev_pagemap *pgmap;
+
+       if (!IS_ENABLED(CONFIG_FUSE_DAX))
+               return 0;
+
+       /* Get cache region */
+       if (!virtio_get_shm_region(vdev, &cache_reg,
+                                  (u8)VIRTIO_FS_SHMCAP_ID_CACHE)) {
+               dev_notice(dev, "%s: No cache capability\n", __func__);
+               return 0;
+       }
+
+       if (!devm_request_mem_region(dev, cache_reg.addr, cache_reg.len,
+                                    dev_name(dev))) {
+               dev_warn(dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+                        cache_reg.addr, cache_reg.len);
+               return -EBUSY;
+       }
+
+       dev_notice(dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+                  cache_reg.addr);
+
+       pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
+       if (!pgmap)
+               return -ENOMEM;
+
+       pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+       /* Ideally we would directly use the PCI BAR resource but
+        * devm_memremap_pages() wants its own copy in pgmap.  So
+        * initialize a struct range from scratch (only the start
+        * and end fields are set).
+        */
+       pgmap->range = (struct range) {
+               .start = (phys_addr_t) cache_reg.addr,
+               .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+       };
+       pgmap->nr_range = 1;
+
+       fs->window_kaddr = devm_memremap_pages(dev, pgmap);
+       if (IS_ERR(fs->window_kaddr))
+               return PTR_ERR(fs->window_kaddr);
+
+       fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+       fs->window_len = (phys_addr_t) cache_reg.len;
+
+       dev_dbg(dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+               __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+       fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+       if (IS_ERR(fs->dax_dev))
+               return PTR_ERR(fs->dax_dev);
+
+       /* Tie the dax device's teardown to the lifetime of the device. */
+       return devm_add_action_or_reset(dev, virtio_fs_cleanup_dax,
+                                       fs->dax_dev);
+}
+
  static int virtio_fs_probe(struct virtio_device *vdev)
  {
         struct virtio_fs *fs;
@@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
  
         /* TODO vq affinity */
  
+       ret = virtio_fs_setup_dax(vdev, fs);
+       if (ret < 0)
+               goto out_vqs;
+
         /* Bring the device online in case the filesystem is mounted and
          * requests need to be sent before we return.
          */
@@ -833,18 +1018,37 @@ __releases(fiq->lock)
         spin_unlock(&fiq->lock);
  }
  
+/* Count sg elements for the pages covering total_len bytes (<= num_pages) */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+                                      unsigned int num_pages,
+                                      unsigned int total_len)
+{
+       unsigned int i;
+       unsigned int this_len; /* bytes of total_len taken by page i */
+
+       for (i = 0; i < num_pages && total_len; i++) { /* stop when nothing is left */
+               this_len =  min(page_descs[i].length, total_len);
+               total_len -= this_len;
+       }
+
+       return i; /* number of descriptors walked */
+}
+
  /* Return the number of scatter-gather list elements required */
  static unsigned int sg_count_fuse_req(struct fuse_req *req)
  {
         struct fuse_args *args = req->args;
         struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
-       unsigned int total_sgs = 1 /* fuse_in_header */;
+       unsigned int size, total_sgs = 1 /* fuse_in_header */;
  
         if (args->in_numargs - args->in_pages)
                 total_sgs += 1;
  
-       if (args->in_pages)
-               total_sgs += ap->num_pages;
+       if (args->in_pages) {
+               size = args->in_args[args->in_numargs - 1].size;
+               total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+                                                size);
+       }
  
         if (!test_bit(FR_ISREPLY, &req->flags))
                 return total_sgs;
@@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
         if (args->out_numargs - args->out_pages)
                 total_sgs += 1;
  
-       if (args->out_pages)
-               total_sgs += ap->num_pages;
+       if (args->out_pages) {
+               size = args->out_args[args->out_numargs - 1].size;
+               total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+                                                size);
+       }
  
         return total_sgs;
  }
@@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
         .release                        = virtio_fs_fiq_release,
  };
  
-static int virtio_fs_fill_super(struct super_block *sb)
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
+       ctx->rootmode = S_IFDIR;
+       ctx->default_permissions = 1;
+       ctx->allow_other = 1;
+       ctx->max_read = UINT_MAX;
+       ctx->blksize = 512;
+       ctx->destroy = true;
+       ctx->no_control = true;
+       ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       struct fuse_conn *fc = fm->fc;
         struct virtio_fs *fs = fc->iq.priv;
+       struct fuse_fs_context *ctx = fsc->fs_private;
         unsigned int i;
         int err;
-       struct fuse_fs_context ctx = {
-               .rootmode = S_IFDIR,
-               .default_permissions = 1,
-               .allow_other = 1,
-               .max_read = UINT_MAX,
-               .blksize = 512,
-               .destroy = true,
-               .no_control = true,
-               .no_force_umount = true,
-               .no_mount_options = true,
-       };
  
+       virtio_fs_ctx_set_defaults(ctx);
         mutex_lock(&virtio_fs_mutex);
  
         /* After holding mutex, make sure virtiofs device is still there.
@@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb)
         }
  
         /* virtiofs allocates and installs its own fuse devices */
-       ctx.fudptr = NULL;
-       err = fuse_fill_super_common(sb, &ctx);
+       ctx->fudptr = NULL;
+       if (ctx->dax)
+               ctx->dax_dev = fs->dax_dev;
+       err = fuse_fill_super_common(sb, ctx);
         if (err < 0)
                 goto err_free_fuse_devs;
  
@@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
  
         /* Previous unmount will stop all queues. Start these again */
         virtio_fs_start_all_queues(fs);
-       fuse_send_init(fc);
+       fuse_send_init(fm);
         mutex_unlock(&virtio_fs_mutex);
         return 0;
  
@@ -1136,18 +1349,17 @@ err:
         return err;
  }
  
-static void virtio_kill_sb(struct super_block *sb)
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
  {
-       struct fuse_conn *fc = get_fuse_conn_super(sb);
-       struct virtio_fs *vfs;
-       struct virtio_fs_vq *fsvq;
-
-       /* If mount failed, we can still be called without any fc */
-       if (!fc)
-               return fuse_kill_sb_anon(sb);
+       struct fuse_conn *fc = fm->fc;
+       struct virtio_fs *vfs = fc->iq.priv;
+       struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
  
-       vfs = fc->iq.priv;
-       fsvq = &vfs->vqs[VQ_HIPRIO];
+       /* Stop dax worker. Soon evict_inodes() will be called which
+        * will free all memory ranges belonging to all inodes.
+        */
+       if (IS_ENABLED(CONFIG_FUSE_DAX))
+               fuse_dax_cancel_work(fc);
  
         /* Stop forget queue. Soon destroy will be sent */
         spin_lock(&fsvq->lock);
@@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb)
         spin_unlock(&fsvq->lock);
         virtio_fs_drain_all_queues(vfs);
  
-       fuse_kill_sb_anon(sb);
+       fuse_conn_destroy(fm);
  
-       /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+       /* fuse_conn_destroy() must have sent destroy. Stop all queues
          * and drain one more time and free fuse devices. Freeing fuse
          * devices will drop their reference on fuse_conn and that in
          * turn will drop its reference on virtio_fs object.
@@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb)
         virtio_fs_free_devs(vfs);
  }
  
+static void virtio_kill_sb(struct super_block *sb)
+{
+       struct fuse_mount *fm = get_fuse_mount_super(sb);
+       bool last; /* result of fuse_mount_remove() */
+
+       /* If mount failed, we can still be called without any fuse_mount */
+       if (fm) {
+               last = fuse_mount_remove(fm);
+               if (last) /* presumably: fm was the connection's last mount */
+                       virtio_fs_conn_destroy(fm);
+       }
+       kill_anon_super(sb); /* done whether or not fm was set */
+}
+
  static int virtio_fs_test_super(struct super_block *sb,
                                 struct fs_context *fsc)
  {
-       struct fuse_conn *fc = fsc->s_fs_info;
+       struct fuse_mount *fsc_fm = fsc->s_fs_info; /* from virtio_fs_get_tree() */
+       struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
  
-       return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+       return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; /* same virtio_fs? */
  }
  
  static int virtio_fs_set_super(struct super_block *sb,
@@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb,
  
         err = get_anon_bdev(&sb->s_dev);
         if (!err)
-               fuse_conn_get(fsc->s_fs_info);
+               fuse_mount_get(fsc->s_fs_info);
  
         return err;
  }
@@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
         struct virtio_fs *fs;
         struct super_block *sb;
         struct fuse_conn *fc;
+       struct fuse_mount *fm;
         int err;
  
         /* This gets a reference on virtio_fs object. This ptr gets installed
@@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
                 return -ENOMEM;
         }
  
-       fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
-                      fs);
+       fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+       if (!fm) {
+               mutex_lock(&virtio_fs_mutex);
+               virtio_fs_put(fs);
+               mutex_unlock(&virtio_fs_mutex);
+               kfree(fc);
+               return -ENOMEM;
+       }
+
+       fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
+                      &virtio_fs_fiq_ops, fs);
         fc->release = fuse_free_conn;
         fc->delete_stale = true;
+       fc->auto_submounts = true;
  
-       fsc->s_fs_info = fc;
+       fsc->s_fs_info = fm;
         sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
-       fuse_conn_put(fc);
+       fuse_mount_put(fm);
         if (IS_ERR(sb))
                 return PTR_ERR(sb);
  
         if (!sb->s_root) {
-               err = virtio_fs_fill_super(sb);
+               err = virtio_fs_fill_super(sb, fsc);
                 if (err) {
                         deactivate_locked_super(sb);
                         return err;
@@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
  }
  
  static const struct fs_context_operations virtio_fs_context_ops = {
+       .free           = virtio_fs_free_fc, /* presumably frees fs_private; confirm */
+       .parse_param    = virtio_fs_parse_param,
         .get_tree       = virtio_fs_get_tree, /* finds or fills the superblock */
  };
  
  static int virtio_fs_init_fs_context(struct fs_context *fsc)
  {
+       struct fuse_fs_context *ctx; /* per-mount fuse context, heap allocated */
+
+       ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+       fsc->fs_private = ctx; /* defaults are filled in by virtio_fs_fill_super() */
         fsc->ops = &virtio_fs_context_ops;
         return 0;
  }
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c

index 20d052e..371bdcb 100644 (file)
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -14,12 +14,12 @@
  int fuse_setxattr(struct inode *inode, const char *name, const void *value,
                   size_t size, int flags)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_setxattr_in inarg;
         int err;
  
-       if (fc->no_setxattr)
+       if (fm->fc->no_setxattr)
                 return -EOPNOTSUPP;
  
         memset(&inarg, 0, sizeof(inarg));
@@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
         args.in_args[1].value = name;
         args.in_args[2].size = size;
         args.in_args[2].value = value;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
-               fc->no_setxattr = 1;
+               fm->fc->no_setxattr = 1;
                 err = -EOPNOTSUPP;
         }
         if (!err) {
@@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
  ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
                       size_t size)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_getxattr_in inarg;
         struct fuse_getxattr_out outarg;
         ssize_t ret;
  
-       if (fc->no_getxattr)
+       if (fm->fc->no_getxattr)
                 return -EOPNOTSUPP;
  
         memset(&inarg, 0, sizeof(inarg));
@@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
                 args.out_args[0].size = sizeof(outarg);
                 args.out_args[0].value = &outarg;
         }
-       ret = fuse_simple_request(fc, &args);
+       ret = fuse_simple_request(fm, &args);
         if (!ret && !size)
                 ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
         if (ret == -ENOSYS) {
-               fc->no_getxattr = 1;
+               fm->fc->no_getxattr = 1;
                 ret = -EOPNOTSUPP;
         }
         return ret;
@@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size)
  ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
  {
         struct inode *inode = d_inode(entry);
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         struct fuse_getxattr_in inarg;
         struct fuse_getxattr_out outarg;
         ssize_t ret;
  
-       if (!fuse_allow_current_process(fc))
+       if (!fuse_allow_current_process(fm->fc))
                 return -EACCES;
  
-       if (fc->no_listxattr)
+       if (fm->fc->no_listxattr)
                 return -EOPNOTSUPP;
  
         memset(&inarg, 0, sizeof(inarg));
@@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
                 args.out_args[0].size = sizeof(outarg);
                 args.out_args[0].value = &outarg;
         }
-       ret = fuse_simple_request(fc, &args);
+       ret = fuse_simple_request(fm, &args);
         if (!ret && !size)
                 ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
         if (ret > 0 && size)
                 ret = fuse_verify_xattr_list(list, ret);
         if (ret == -ENOSYS) {
-               fc->no_listxattr = 1;
+               fm->fc->no_listxattr = 1;
                 ret = -EOPNOTSUPP;
         }
         return ret;
@@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
  
  int fuse_removexattr(struct inode *inode, const char *name)
  {
-       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_mount *fm = get_fuse_mount(inode);
         FUSE_ARGS(args);
         int err;
  
-       if (fc->no_removexattr)
+       if (fm->fc->no_removexattr)
                 return -EOPNOTSUPP;
  
         args.opcode = FUSE_REMOVEXATTR;
@@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
         args.in_numargs = 1;
         args.in_args[0].size = strlen(name) + 1;
         args.in_args[0].value = name;
-       err = fuse_simple_request(fc, &args);
+       err = fuse_simple_request(fm, &args);
         if (err == -ENOSYS) {
-               fc->no_removexattr = 1;
+               fm->fc->no_removexattr = 1;
                 err = -EOPNOTSUPP;
         }
         if (!err) {
diff --git a/include/linux/dax.h b/include/linux/dax.h

index e153572..b52f084 100644 (file)
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -149,6 +149,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
                 struct dax_device *dax_dev, struct writeback_control *wbc);
  
  struct page *dax_layout_busy_page(struct address_space *mapping);
+struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
  dax_entry_t dax_lock_page(struct page *page);
  void dax_unlock_page(struct page *page, dax_entry_t cookie);
  #else
@@ -179,6 +180,11 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping)
         return NULL;
  }
  
+static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end)
+{
+       return NULL; /* stub: no busy page is ever reported */
+}
+
  static inline int dax_writeback_mapping_range(struct address_space *mapping,
                 struct dax_device *dax_dev, struct writeback_control *wbc)
  {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h

index 373cada..7233502 100644 (file)
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -172,6 +172,9 @@
   *  - add FUSE_WRITE_KILL_PRIV flag
   *  - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
   *  - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag
+ *
+ *  7.32
+ *  - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS
   */
  
  #ifndef _LINUX_FUSE_H
@@ -207,7 +210,7 @@
  #define FUSE_KERNEL_VERSION 7
  
  /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 31
+#define FUSE_KERNEL_MINOR_VERSION 32
  
  /** The node ID of the root inode */
  #define FUSE_ROOT_ID 1
@@ -231,7 +234,7 @@ struct fuse_attr {
         uint32_t        gid;
         uint32_t        rdev;
         uint32_t        blksize;
-       uint32_t        padding;
+       uint32_t        flags;
  };
  
  struct fuse_kstatfs {
@@ -313,7 +316,10 @@ struct fuse_file_lock {
   * FUSE_CACHE_SYMLINKS: cache READLINK responses
   * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
   * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
- * FUSE_MAP_ALIGNMENT: map_alignment field is valid
+ * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for
+ *                    foffset and moffset fields in struct
+ *                    fuse_setupmapping_in and fuse_removemapping_one.
+ * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts
   */
  #define FUSE_ASYNC_READ                (1 << 0)
  #define FUSE_POSIX_LOCKS       (1 << 1)
@@ -342,6 +348,7 @@ struct fuse_file_lock {
  #define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
  #define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
  #define FUSE_MAP_ALIGNMENT     (1 << 26)
+#define FUSE_SUBMOUNTS         (1 << 27)
  
  /**
   * CUSE INIT request/reply flags
@@ -417,6 +424,13 @@ struct fuse_file_lock {
   */
  #define FUSE_FSYNC_FDATASYNC   (1 << 0)
  
+/**
+ * fuse_attr flags
+ *
+ * FUSE_ATTR_SUBMOUNT: Object is a submount root
+ */
+#define FUSE_ATTR_SUBMOUNT      (1 << 0)
+
  enum fuse_opcode {
         FUSE_LOOKUP             = 1,
         FUSE_FORGET             = 2,  /* no reply */
@@ -892,4 +906,34 @@ struct fuse_copy_file_range_in {
         uint64_t        flags;
  };
  
+#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
+struct fuse_setupmapping_in {
+       /* An already open file handle */
+       uint64_t        fh;
+       /* Offset into the file at which the mapping starts */
+       uint64_t        foffset;
+       /* Length of the mapping required */
+       uint64_t        len;
+       /* Flags, FUSE_SETUPMAPPING_FLAG_* */
+       uint64_t        flags;
+       /* Offset in the memory window */
+       uint64_t        moffset;
+};
+
+struct fuse_removemapping_in {
+       /* Number of struct fuse_removemapping_one entries that follow */
+       uint32_t        count;
+};
+
+struct fuse_removemapping_one {
+       /* Offset into the dax window at which to start the unmapping */
+       uint64_t        moffset;
+       /* Length of the range to unmap */
+       uint64_t        len;
+};
+
+#define FUSE_REMOVEMAPPING_MAX_ENTRY   \
+               (PAGE_SIZE / sizeof(struct fuse_removemapping_one))
+
  #endif /* _LINUX_FUSE_H */
diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h

index 3056b6e..bea3829 100644 (file)
--- a/include/uapi/linux/virtio_fs.h
+++ b/include/uapi/linux/virtio_fs.h
@@ -16,4 +16,7 @@ struct virtio_fs_config {
         __le32 num_request_queues;
  } __attribute__((packed));
  
+/* For the id field in virtio_pci_shm_cap */
+#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+
  #endif /* _UAPI_LINUX_VIRTIO_FS_H */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 19 Oct 2020 21:28:30 +0000 (14:28 -0700)
Documentation/filesystems/fuse.rst		patch \| blob \| history
MAINTAINERS		patch \| blob \| history
drivers/dax/super.c		patch \| blob \| history
fs/dax.c		patch \| blob \| history
fs/fuse/Kconfig		patch \| blob \| history
fs/fuse/Makefile		patch \| blob \| history
fs/fuse/control.c		patch \| blob \| history
fs/fuse/cuse.c		patch \| blob \| history
fs/fuse/dax.c	[new file with mode: 0644]	patch \| blob
fs/fuse/dev.c		patch \| blob \| history
fs/fuse/dir.c		patch \| blob \| history
fs/fuse/file.c		patch \| blob \| history
fs/fuse/fuse_i.h		patch \| blob \| history
fs/fuse/inode.c		patch \| blob \| history
fs/fuse/readdir.c		patch \| blob \| history
fs/fuse/virtio_fs.c		patch \| blob \| history
fs/fuse/xattr.c		patch \| blob \| history
include/linux/dax.h		patch \| blob \| history
include/uapi/linux/fuse.h		patch \| blob \| history
include/uapi/linux/virtio_fs.h		patch \| blob \| history