using the sftp protocol.
The userspace library and utilities are available from the
-`FUSE homepage: <http://fuse.sourceforge.net/>`_
+`FUSE homepage: <https://github.com/libfuse/>`_
Filesystem type
===============
M: Miklos Szeredi <miklos@szeredi.hu>
L: linux-fsdevel@vger.kernel.org
S: Maintained
-W: http://fuse.sourceforge.net/
+W: https://github.com/libfuse/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git
F: Documentation/filesystems/fuse.rst
F: fs/fuse/
int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
pgoff_t *pgoff)
{
- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
+ sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
+ phys_addr_t phys_off = (start_sect + sector) * 512;
if (pgoff)
*pgoff = PHYS_PFN(phys_off);
}
/**
- * dax_layout_busy_page - find first pinned page in @mapping
+ * dax_layout_busy_page_range - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ * pages from 'start' till the end of file are included.
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
+ loff_t start, loff_t end)
{
- XA_STATE(xas, &mapping->i_pages, 0);
void *entry;
unsigned int scanned = 0;
struct page *page = NULL;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
/*
* In the 'limited' case get_user_pages() for dax is disabled.
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
+ /* If end == LLONG_MAX, all pages from start till end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the iteration and wait, or
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
- * holding those locks, and unmap_mapping_range() will not zero the
+ * holding those locks, and unmap_mapping_pages() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
- unmap_mapping_range(mapping, 0, 0, 0);
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
xas_lock_irq(&xas);
- xas_for_each(&xas, entry, ULONG_MAX) {
+ xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
if (unlikely(dax_is_locked(entry)))
xas_unlock_irq(&xas);
return page;
}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+ return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
+}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
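For context, a minimal sketch of how a filesystem can drive dax_layout_busy_page_range(): retry until no page in the range holds an elevated reference. This is a hypothetical caller, not part of the patch; the FUSE helper __fuse_dax_break_layouts() added later in this series follows the same pattern, with a wait command that also drops and re-takes fi->i_mmap_sem around the sleep.

/*
 * Hypothetical sketch: wait until no DAX page in [start, end] is pinned
 * by get_user_pages(). Assumes no locks are held across the sleep.
 */
static int example_break_layouts(struct inode *inode, loff_t start, loff_t end)
{
	struct page *page;
	int ret;

	while ((page = dax_layout_busy_page_range(inode->i_mapping,
						  start, end))) {
		/* Sleep until the elevated page reference is dropped */
		ret = ___wait_var_event(&page->_refcount,
					atomic_read(&page->_refcount) == 1,
					TASK_INTERRUPTIBLE, 0, 0, schedule());
		if (ret)
			return ret;
	}
	return 0;
}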
static int __dax_invalidate_entry(struct address_space *mapping,
There's also a companion library: libfuse2. This library is available
from the FUSE homepage:
- <http://fuse.sourceforge.net/>
+ <https://github.com/libfuse/>
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
If you want to share files between guests or with the host, answer Y
or M.
+
+config FUSE_DAX
+ bool "Virtio Filesystem Direct Host Memory Access support"
+ default y
+ select INTERVAL_TREE
+ depends on VIRTIO_FS
+ depends on FS_DAX
+ depends on DAX_DRIVER
+ help
+ This allows bypassing the guest page cache and mapping the host
+ page cache directly into the guest address space.
+
+ If you want to allow mounting a Virtio Filesystem with the "dax"
+ option, answer Y.
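As a usage illustration (hypothetical mount point and tag, not part of this patch), a guest would then mount the filesystem with the "dax" option, for example via the mount(2) syscall:

#include <sys/mount.h>

/* Hypothetical sketch: mount a virtiofs filesystem with DAX enabled.
 * "myfs" is an example virtiofs tag configured on the host. */
static int mount_virtiofs_dax(void)
{
	return mount("myfs", "/mnt/virtiofs", "virtiofs", 0, "dax");
}

The equivalent command line would be "mount -t virtiofs -o dax myfs /mnt/virtiofs".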
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
-virtiofs-y += virtio_fs.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
{
unsigned val;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
if (!fc)
goto out;
+ down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
- if (fc->sb) {
+
+ /*
+ * Get any fuse_mount belonging to this fuse_conn; s_bdi is
+ * shared between all of them
+ */
+
+ if (!list_empty(&fc->mounts)) {
+ fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
} else {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
}
spin_unlock(&fc->bg_lock);
+ up_read(&fc->killsb);
fuse_conn_put(fc);
out:
return ret;
struct cuse_conn {
struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_mount fm; /* Dummy mount referencing fc */
struct fuse_conn fc; /* fuse connection */
struct cdev *cdev; /* associated character device */
struct device *dev; /* device representing @cdev */
* Generic permission check is already done against the chrdev
* file, proceed to open.
*/
- rc = fuse_do_open(&cc->fc, 0, file, 0);
+ rc = fuse_do_open(&cc->fm, 0, file, 0);
if (rc)
fuse_conn_put(&cc->fc);
return rc;
static int cuse_release(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_sync_release(NULL, ff, file->f_flags);
- fuse_conn_put(fc);
+ fuse_conn_put(fm->fc);
return 0;
}
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = 0;
if (cc->unrestricted_ioctl)
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = FUSE_IOCTL_COMPAT;
if (cc->unrestricted_ioctl)
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
*/
-static void cuse_process_init_reply(struct fuse_conn *fc,
+static void cuse_process_init_reply(struct fuse_mount *fm,
struct fuse_args *args, int error)
{
+ struct fuse_conn *fc = fm->fc;
struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_args_pages *ap = &ia->ap;
struct cuse_conn *cc = fc_to_cc(fc), *pos;
{
int rc;
struct page *page;
- struct fuse_conn *fc = &cc->fc;
+ struct fuse_mount *fm = &cc->fm;
struct cuse_init_args *ia;
struct fuse_args_pages *ap;
ia->desc.length = ap->args.out_args[1].size;
ap->args.end = cuse_process_init_reply;
- rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (rc) {
kfree(ia);
err_free_page:
* Limit the cuse channel to requests that can
* be represented in file->f_cred->user_ns.
*/
- fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
+ fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+ &fuse_dev_fiq_ops, NULL);
fud = fuse_dev_alloc_install(&cc->fc);
if (!fud) {
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK (10)
+
+/*
+ * Dax memory reclaim threshold in percentage of total ranges. When the
+ * number of free ranges drops below this threshold, reclaim can trigger.
+ * Default is 20%.
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+ /* Pointer to inode where this memory range is mapped */
+ struct inode *inode;
+
+ /* Will connect in fcd->free_ranges to keep track of free memory */
+ struct list_head list;
+
+ /* For interval tree in file/inode */
+ struct interval_tree_node itn;
+
+ /* Will connect in fcd->busy_ranges to keep track of busy memory */
+ struct list_head busy_list;
+
+ /** Position in DAX window */
+ u64 window_offset;
+
+ /** Length of mapping, in bytes */
+ loff_t length;
+
+ /* Is this mapping read-only or read-write */
+ bool writable;
+
+ /* reference count when the mapping is used by dax iomap. */
+ refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+ /* Semaphore to protect modifications to the dmap tree */
+ struct rw_semaphore sem;
+
+ /* Sorted rb tree of struct fuse_dax_mapping elements */
+ struct rb_root_cached tree;
+ unsigned long nr;
+};
+
+struct fuse_conn_dax {
+ /* DAX device */
+ struct dax_device *dev;
+
+ /* Lock protecting accesses to members of this structure */
+ spinlock_t lock;
+
+ /* List of memory ranges which are busy */
+ unsigned long nr_busy_ranges;
+ struct list_head busy_ranges;
+
+ /* Worker to free up memory ranges */
+ struct delayed_work free_work;
+
+ /* Wait queue for a dax range to become free */
+ wait_queue_head_t range_waitq;
+
+ /* DAX Window Free Ranges */
+ long nr_free_ranges;
+ struct list_head free_ranges;
+
+ unsigned long nr_ranges;
+};
+
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+ if (!node)
+ return NULL;
+
+ return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+ unsigned long free_threshold;
+
+ /* If the number of free ranges is below the threshold, start reclaim */
+ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
+ 1);
+ if (fcd->nr_free_ranges < free_threshold)
+ queue_delayed_work(system_long_wq, &fcd->free_work,
+ msecs_to_jiffies(delay_ms));
+}
+
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+ unsigned long delay_ms)
+{
+ spin_lock(&fcd->lock);
+ __kick_dmap_free_worker(fcd, delay_ms);
+ spin_unlock(&fcd->lock);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+ struct fuse_dax_mapping *dmap;
+
+ spin_lock(&fcd->lock);
+ dmap = list_first_entry_or_null(&fcd->free_ranges,
+ struct fuse_dax_mapping, list);
+ if (dmap) {
+ list_del_init(&dmap->list);
+ WARN_ON(fcd->nr_free_ranges <= 0);
+ fcd->nr_free_ranges--;
+ }
+ spin_unlock(&fcd->lock);
+
+ kick_dmap_free_worker(fcd, 0);
+ return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(&dmap->busy_list);
+ WARN_ON(fcd->nr_busy_ranges == 0);
+ fcd->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(&fcd->lock);
+ __dmap_remove_busy_list(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_add_tail(&dmap->list, &fcd->free_ranges);
+ fcd->nr_free_ranges++;
+ wake_up(&fcd->range_waitq);
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ /* Return fuse_dax_mapping to free list */
+ spin_lock(&fcd->lock);
+ __dmap_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+ struct fuse_dax_mapping *dmap, bool writable,
+ bool upgrade)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn_dax *fcd = fm->fc->dax;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_setupmapping_in inarg;
+ loff_t offset = start_idx << FUSE_DAX_SHIFT;
+ FUSE_ARGS(args);
+ ssize_t err;
+
+ WARN_ON(fcd->nr_free_ranges < 0);
+
+ /* Ask fuse daemon to setup mapping */
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.foffset = offset;
+ inarg.fh = -1;
+ inarg.moffset = dmap->window_offset;
+ inarg.len = FUSE_DAX_SZ;
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+ if (writable)
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+ args.opcode = FUSE_SETUPMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err < 0)
+ return err;
+ dmap->writable = writable;
+ if (!upgrade) {
+ /*
+ * We don't take a reference on the inode. The inode is valid right
+ * now and when the inode is going away, cleanup logic should first
+ * clean up the dmap entries.
+ */
+ dmap->inode = inode;
+ dmap->itn.start = dmap->itn.last = start_idx;
+ /* Protected by fi->dax->sem */
+ interval_tree_insert(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr++;
+ spin_lock(&fcd->lock);
+ list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+ fcd->nr_busy_ranges++;
+ spin_unlock(&fcd->lock);
+ }
+ return 0;
+}
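On the server side of FUSE_SETUPMAPPING, the daemon is expected to establish the requested file range at moffset inside the DAX window. A minimal, purely illustrative sketch follows; it is not virtiofsd's actual implementation (which forwards the request to the VMM over a vhost-user channel) and assumes the daemon has the whole window mapped at window_base:

#include <sys/mman.h>
#include <errno.h>
#include <linux/fuse.h>

/* Hypothetical daemon-side handler: map [foffset, foffset + len) of the
 * file referred to by fd into the shared DAX window at moffset. */
static int handle_setupmapping(void *window_base, int fd,
			       const struct fuse_setupmapping_in *in)
{
	int prot = PROT_READ;

	if (in->flags & FUSE_SETUPMAPPING_FLAG_WRITE)
		prot |= PROT_WRITE;

	if (mmap((char *)window_base + in->moffset, in->len, prot,
		 MAP_SHARED | MAP_FIXED, fd, (off_t)in->foffset) == MAP_FAILED)
		return -errno;

	return 0;
}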
+
+static int fuse_send_removemapping(struct inode *inode,
+ struct fuse_removemapping_in *inargp,
+ struct fuse_removemapping_one *remove_one)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_REMOVEMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(*inargp);
+ args.in_args[0].value = inargp;
+ args.in_args[1].size = inargp->count * sizeof(*remove_one);
+ args.in_args[1].value = remove_one;
+ return fuse_simple_request(fm, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+ struct list_head *to_remove)
+{
+ struct fuse_removemapping_one *remove_one, *ptr;
+ struct fuse_removemapping_in inarg;
+ struct fuse_dax_mapping *dmap;
+ int ret, i = 0, nr_alloc;
+
+ nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+ remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+ if (!remove_one)
+ return -ENOMEM;
+
+ ptr = remove_one;
+ list_for_each_entry(dmap, to_remove, list) {
+ ptr->moffset = dmap->window_offset;
+ ptr->len = dmap->length;
+ ptr++;
+ i++;
+ num--;
+ if (i >= nr_alloc || num == 0) {
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = i;
+ ret = fuse_send_removemapping(inode, &inarg,
+ remove_one);
+ if (ret)
+ goto out;
+ ptr = remove_one;
+ i = 0;
+ }
+ }
+out:
+ kfree(remove_one);
+ return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+ dmap->itn.start, dmap->itn.last, dmap->window_offset,
+ dmap->length);
+ __dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+ __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point in time it should only be
+ * called from the evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ loff_t start, loff_t end)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap, *n;
+ int err, num = 0;
+ LIST_HEAD(to_remove);
+ unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+ unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ while (1) {
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+ end_idx);
+ if (!node)
+ break;
+ dmap = node_to_dmap(node);
+ /* inode is going away. There should not be any users of dmap */
+ WARN_ON(refcount_read(&dmap->refcnt) > 1);
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ num++;
+ list_add(&dmap->list, &to_remove);
+ }
+
+ /* Nothing to remove */
+ if (list_empty(&to_remove))
+ return;
+
+ WARN_ON(fi->dax->nr < num);
+ fi->dax->nr -= num;
+ err = dmap_removemapping_list(inode, num, &to_remove);
+ if (err && err != -ENOTCONN) {
+ pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+ start, end);
+ }
+ spin_lock(&fcd->lock);
+ list_for_each_entry_safe(dmap, n, &to_remove, list) {
+ list_del_init(&dmap->list);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ }
+ spin_unlock(&fcd->lock);
+}
+
+static int dmap_removemapping_one(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ struct fuse_removemapping_one forget_one;
+ struct fuse_removemapping_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = 1;
+ memset(&forget_one, 0, sizeof(forget_one));
+ forget_one.moffset = dmap->window_offset;
+ forget_one.len = dmap->length;
+
+ return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
+/*
+ * It is called from evict_inode() and by that time the inode is going away.
+ * So this function does not take any locks like fi->dax->sem for traversing
+ * the fuse inode interval tree. If that lock is taken then the lock validator
+ * complains of a deadlock situation w.r.t. the fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * fuse_evict_inode() has already called truncate_inode_pages_final()
+ * before we arrive here. So we should not have to worry about any
+ * pages/exception entries still associated with inode.
+ */
+ inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+ WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap, struct fuse_dax_mapping *dmap,
+ unsigned int flags)
+{
+ loff_t offset, len;
+ loff_t i_size = i_size_read(inode);
+
+ offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+ len = min(length, dmap->length - offset);
+
+ /* If length is beyond end of file, truncate further */
+ if (pos + len > i_size)
+ len = i_size - pos;
+
+ if (len > 0) {
+ iomap->addr = dmap->window_offset + offset;
+ iomap->length = len;
+ if (flags & IOMAP_FAULT)
+ iomap->length = ALIGN(len, PAGE_SIZE);
+ iomap->type = IOMAP_MAPPED;
+ /*
+ * Increase refcnt so that reclaim code knows this dmap is in
+ * use. This assumes fi->dax->sem is held either
+ * shared or exclusive.
+ */
+ refcount_inc(&dmap->refcnt);
+
+ /* iomap->private should be NULL */
+ WARN_ON_ONCE(iomap->private);
+ iomap->private = dmap;
+ } else {
+ /* Mapping beyond end of file is hole */
+ fuse_fill_iomap_hole(iomap, length);
+ }
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+ int ret;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Can't do inline reclaim in fault path. We call
+ * dax_layout_busy_page() before we free a range. And
+ * fuse_wait_dax_page() drops and re-acquires fi->i_mmap_sem.
+ * In fault path we enter with fi->i_mmap_sem held and can't drop
+ * it. Also in fault path we hold fi->i_mmap_sem shared and not
+ * exclusive, so that creates further issues with fuse_wait_dax_page().
+ * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
+ * range to become free and retry.
+ */
+ if (flags & IOMAP_FAULT) {
+ alloc_dmap = alloc_dax_mapping(fcd);
+ if (!alloc_dmap)
+ return -EAGAIN;
+ } else {
+ alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+ if (IS_ERR(alloc_dmap))
+ return PTR_ERR(alloc_dmap);
+ }
+
+ /* If we are here, we should have memory allocated */
+ if (WARN_ON(!alloc_dmap))
+ return -EIO;
+
+ /*
+ * Take write lock so that only one caller can try to set up the mapping
+ * and others wait.
+ */
+ down_write(&fi->dax->sem);
+ /*
+ * We dropped the lock. Check again whether somebody else has
+ * already set up the mapping.
+ */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return 0;
+ }
+
+ /* Setup one mapping */
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+ writable, false);
+ if (ret < 0) {
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return ret;
+ }
+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+ up_write(&fi->dax->sem);
+ return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ int ret;
+ unsigned long idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Take exclusive lock so that only one caller can try to setup
+ * mapping and others wait.
+ */
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+ /* We are holding either inode lock or i_mmap_sem, and that should
+ * ensure that dmap can't be truncated. We are holding a reference
+ * on dmap and that should make sure it can't be reclaimed. So dmap
+ * should still be there in tree despite the fact we dropped and
+ * re-acquired the fi->dax->sem lock.
+ */
+ ret = -EIO;
+ if (WARN_ON(!node))
+ goto out_err;
+
+ dmap = node_to_dmap(node);
+
+ /* We took an extra reference on dmap to make sure it's not reclaimed.
+ * Now we hold fi->dax->sem lock and that reference is not needed
+ * anymore. Drop it.
+ */
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Maybe another thread already upgraded mapping while we were not
+ * holding lock.
+ */
+ if (dmap->writable) {
+ ret = 0;
+ goto out_fill_iomap;
+ }
+
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+ true);
+ if (ret < 0)
+ goto out_err;
+out_fill_iomap:
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+ up_write(&fi->dax->sem);
+ return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_dax_mapping *dmap;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /* We don't support FIEMAP */
+ if (WARN_ON(flags & IOMAP_REPORT))
+ return -EIO;
+
+ iomap->offset = pos;
+ iomap->flags = 0;
+ iomap->bdev = NULL;
+ iomap->dax_dev = fc->dax->dev;
+
+ /*
+ * Both the read/write and mmap paths can race here. So we need something
+ * to make sure that if we are setting up a mapping, the other path waits.
+ *
+ * For now, use a semaphore for this. It probably needs to be
+ * optimized later.
+ */
+ down_read(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ if (writable && !dmap->writable) {
+ /* Upgrade read-only mapping to read-write. This will
+ * require exclusive fi->dax->sem lock as we don't want
+ * two threads to be trying to do this simultaneously
+ * for the same dmap. So drop the shared lock and acquire
+ * the exclusive lock.
+ *
+ * Before dropping fi->dax->sem lock, take a reference
+ * on dmap so that it's not freed by range reclaim.
+ */
+ refcount_inc(&dmap->refcnt);
+ up_read(&fi->dax->sem);
+ pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ return fuse_upgrade_dax_mapping(inode, pos, length,
+ flags, iomap);
+ } else {
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ up_read(&fi->dax->sem);
+ return 0;
+ }
+ } else {
+ up_read(&fi->dax->sem);
+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ if (pos >= i_size_read(inode))
+ goto iomap_hole;
+
+ return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+ iomap);
+ }
+
+ /*
+ * If a read beyond end of file happens, fs code returns
+ * it as a hole
+ */
+iomap_hole:
+ fuse_fill_iomap_hole(iomap, length);
+ pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+ __func__, pos, length, iomap->length);
+ return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_dax_mapping *dmap = iomap->private;
+
+ if (dmap) {
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* DAX writes beyond end-of-file aren't handled using iomap, so the
+ * file size is unchanged and there is nothing to do here.
+ */
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+ .iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ up_write(&fi->i_mmap_sem);
+ schedule();
+ down_write(&fi->i_mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+ loff_t start, loff_t end)
+{
+ struct page *page;
+
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+ u64 dmap_end)
+{
+ bool retry;
+ int ret;
+
+ do {
+ retry = false;
+ ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+ dmap_end);
+ } while (ret == 0 && retry);
+
+ return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+ inode_unlock_shared(inode);
+
+ /* TODO file_accessed(iocb->ki_filp) */
+ return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return (iov_iter_rw(from) == WRITE &&
+ ((iocb->ki_pos) >= i_size_read(inode) ||
+ (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t ret;
+
+ ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (ret < 0)
+ return ret;
+
+ fuse_invalidate_attr(inode);
+ fuse_write_update_size(inode, iocb->ki_pos);
+ return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ /* TODO file_update_time() but we don't want metadata I/O */
+
+ /* Do not use dax for file extending writes, as the write and the
+ * on-disk i_size update are not atomic otherwise.
+ */
+ if (file_extending_write(iocb, from))
+ ret = fuse_dax_direct_write(iocb, from);
+ else
+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
+static int fuse_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
+}
+
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, bool write)
+{
+ vm_fault_t ret;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ pfn_t pfn;
+ int error = 0;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ bool retry = false;
+
+ if (write)
+ sb_start_pagefault(sb);
+retry:
+ if (retry && !(fcd->nr_free_ranges > 0))
+ wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+ error = 0;
+ retry = true;
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ goto retry;
+ }
+
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+
+ if (write)
+ sb_end_pagefault(sb);
+
+ return ret;
+}
+
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE,
+ vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+ .fault = fuse_dax_fault,
+ .huge_fault = fuse_dax_huge_fault,
+ .page_mkwrite = fuse_dax_page_mkwrite,
+ .pfn_mkwrite = fuse_dax_pfn_mkwrite,
+};
+
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &fuse_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+}
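From the guest's point of view nothing special is needed beyond the dax mount option: a plain mmap() of a file on the virtiofs mount goes through fuse_dax_vm_ops above, so faults are served directly from the DAX window. A hypothetical guest-side example (paths are illustrative):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Hypothetical guest program: map a file on a dax-enabled virtiofs mount. */
static int map_example(void)
{
	int fd = open("/mnt/virtiofs/data.bin", O_RDWR);
	char *p;

	if (fd < 0)
		return -1;

	/* Page faults here end up in __fuse_dax_fault() */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return -1;
	}

	p[0] = 1;	/* store hits host page cache through the DAX window */
	munmap(p, 4096);
	close(fd);
	return 0;
}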
+
+static int dmap_writeback_invalidate(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
+ loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+ if (ret) {
+ pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+ ret, start_pos, end_pos);
+ return ret;
+ }
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ start_pos >> PAGE_SHIFT,
+ end_pos >> PAGE_SHIFT);
+ if (ret)
+ pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+ ret);
+
+ return ret;
+}
+
+static int reclaim_one_dmap_locked(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * igrab() was done to make sure the inode won't go away under us,
+ * and this further avoids the race with evict().
+ */
+ ret = dmap_writeback_invalidate(inode, dmap);
+ if (ret)
+ return ret;
+
+ /* Remove dax mapping from inode interval tree now */
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr--;
+
+ /* It is possible that umount/shutdown has killed the fuse connection
+ * and worker thread is trying to reclaim memory in parallel. Don't
+ * warn in that case.
+ */
+ ret = dmap_removemapping_one(inode, dmap);
+ if (ret && ret != -ENOTCONN) {
+ pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+ dmap->window_offset, dmap->length, ret);
+ }
+ return 0;
+}
+
+/* Find the first mapped dmap for an inode. Caller needs
+ * to hold fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
+ node = interval_tree_iter_next(node, 0, -1)) {
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ continue;
+
+ return dmap;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+ bool *retry)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ u64 dmap_start, dmap_end;
+ unsigned long start_idx;
+ int ret;
+ struct interval_tree_node *node;
+
+ down_write(&fi->i_mmap_sem);
+
+ /* Lookup a dmap and corresponding file offset to reclaim. */
+ down_read(&fi->dax->sem);
+ dmap = inode_lookup_first_dmap(inode);
+ if (dmap) {
+ start_idx = dmap->itn.start;
+ dmap_start = start_idx << FUSE_DAX_SHIFT;
+ dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+ }
+ up_read(&fi->dax->sem);
+
+ if (!dmap)
+ goto out_mmap_sem;
+ /*
+ * Make sure there are no references to inode pages using
+ * get_user_pages()
+ */
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ dmap = ERR_PTR(ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ /* Range already got reclaimed by somebody else */
+ if (!node) {
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1) {
+ dmap = NULL;
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0) {
+ dmap = ERR_PTR(ret);
+ goto out_write_dmap_sem;
+ }
+
+ /* Clean up dmap. Do not add back to free list */
+ dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+
+ pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+ __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return dmap;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+ struct fuse_dax_mapping *dmap;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ while (1) {
+ bool retry = false;
+
+ dmap = alloc_dax_mapping(fcd);
+ if (dmap)
+ return dmap;
+
+ dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+ /*
+ * Either we got a mapping or it is an error; return in both
+ * cases.
+ */
+ if (dmap)
+ return dmap;
+
+ /* If we could not reclaim a mapping because it
+ * had a reference or some other temporary failure,
+ * try again. We want to give up inline reclaim only
+ * if there is no range assigned to this node. Otherwise
+ * a deadlock is possible if we sleep with fi->i_mmap_sem
+ * held and the worker freeing memory can't make progress due
+ * to unavailability of the fi->i_mmap_sem lock. So sleep
+ * only if fi->dax->nr=0
+ */
+ if (retry)
+ continue;
+ /*
+ * There are no mappings which can be reclaimed. Wait for one.
+ * We are not holding fi->dax->sem. So it is possible
+ * that a range gets added now. But as we are not holding
+ * fi->i_mmap_sem, worker should still be able to free up
+ * a range and wake us up.
+ */
+ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+ if (wait_event_killable_exclusive(fcd->range_waitq,
+ (fcd->nr_free_ranges > 0))) {
+ return ERR_PTR(-EINTR);
+ }
+ }
+ }
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ /* Find the fuse dax mapping at this file offset in the inode. */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+ /* Range already got cleaned up by somebody else */
+ if (!node)
+ return 0;
+ dmap = node_to_dmap(node);
+
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ return 0;
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0)
+ return ret;
+
+ /* Cleanup dmap entry and add back to free list */
+ spin_lock(&fcd->lock);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+ return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take fi->i_mmap_sem to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ * read/write cannot reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx,
+ unsigned long end_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
+ loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+ down_write(&fi->i_mmap_sem);
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+ unsigned long nr_to_free)
+{
+ struct fuse_dax_mapping *dmap, *pos, *temp;
+ int ret, nr_freed = 0;
+ unsigned long start_idx = 0, end_idx = 0;
+ struct inode *inode = NULL;
+
+ /* Pick the first busy range and free it for now */
+ while (1) {
+ if (nr_freed >= nr_to_free)
+ break;
+
+ dmap = NULL;
+ spin_lock(&fcd->lock);
+
+ if (!fcd->nr_busy_ranges) {
+ spin_unlock(&fcd->lock);
+ return 0;
+ }
+
+ list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+ busy_list) {
+ /* skip this range if it's in use. */
+ if (refcount_read(&pos->refcnt) > 1)
+ continue;
+
+ inode = igrab(pos->inode);
+ /*
+ * This inode is going away. That will free
+ * up all the ranges anyway, continue to
+ * next range.
+ */
+ if (!inode)
+ continue;
+ /*
+ * Take this element off the list and add it to the tail.
+ * If this element can't be freed, it will help with
+ * selecting a new element in the next iteration of the loop.
+ */
+ dmap = pos;
+ list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+ start_idx = end_idx = dmap->itn.start;
+ break;
+ }
+ spin_unlock(&fcd->lock);
+ if (!dmap)
+ return 0;
+
+ ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+ iput(inode);
+ if (ret)
+ return ret;
+ nr_freed++;
+ }
+ return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+ int ret;
+ struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+ free_work.work);
+ ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+ if (ret) {
+ pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+ ret);
+ }
+
+ /* If the number of free ranges is still below the threshold, requeue */
+ kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+ struct fuse_dax_mapping *range, *temp;
+
+ /* Free all allocated elements */
+ list_for_each_entry_safe(range, temp, mem_list, list) {
+ list_del(&range->list);
+ if (!list_empty(&range->busy_list))
+ list_del(&range->busy_list);
+ kfree(range);
+ }
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+ if (fc->dax) {
+ fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+ kfree(fc->dax);
+ }
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+ long nr_pages, nr_ranges;
+ void *kaddr;
+ pfn_t pfn;
+ struct fuse_dax_mapping *range;
+ int ret, id;
+ size_t dax_size = -1;
+ unsigned long i;
+
+ init_waitqueue_head(&fcd->range_waitq);
+ INIT_LIST_HEAD(&fcd->free_ranges);
+ INIT_LIST_HEAD(&fcd->busy_ranges);
+ INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+ id = dax_read_lock();
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
+ &pfn);
+ dax_read_unlock(id);
+ if (nr_pages < 0) {
+ pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+ return nr_pages;
+ }
+
+ nr_ranges = nr_pages/FUSE_DAX_PAGES;
+ pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+ __func__, nr_pages, nr_ranges);
+
+ for (i = 0; i < nr_ranges; i++) {
+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!range)
+ goto out_err;
+
+ /* TODO: This offset only works if the virtio-fs driver does not
+ * hide some memory at the beginning of the window. This needs
+ * better handling
+ */
+ range->window_offset = i * FUSE_DAX_SZ;
+ range->length = FUSE_DAX_SZ;
+ INIT_LIST_HEAD(&range->busy_list);
+ refcount_set(&range->refcnt, 1);
+ list_add_tail(&range->list, &fcd->free_ranges);
+ }
+
+ fcd->nr_free_ranges = nr_ranges;
+ fcd->nr_ranges = nr_ranges;
+ return 0;
+out_err:
+ /* Free all allocated elements */
+ fuse_free_dax_mem_ranges(&fcd->free_ranges);
+ return ret;
+}
+
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
+{
+ struct fuse_conn_dax *fcd;
+ int err;
+
+ if (!dax_dev)
+ return 0;
+
+ fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
+ if (!fcd)
+ return -ENOMEM;
+
+ spin_lock_init(&fcd->lock);
+ fcd->dev = dax_dev;
+ err = fuse_dax_mem_range_init(fcd);
+ if (err) {
+ kfree(fcd);
+ return err;
+ }
+
+ fc->dax = fcd;
+ return 0;
+}
+
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fi->dax = NULL;
+ if (fc->dax) {
+ fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+ if (!fi->dax)
+ return false;
+
+ init_rwsem(&fi->dax->sem);
+ fi->dax->tree = RB_ROOT_CACHED;
+ }
+
+ return true;
+}
+
+static const struct address_space_operations fuse_dax_file_aops = {
+ .writepages = fuse_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+};
+
+void fuse_dax_inode_init(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->dax)
+ return;
+
+ inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &fuse_dax_file_aops;
+}
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+ if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+ pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+ map_alignment, FUSE_DAX_SZ);
+ return false;
+ }
+ return true;
+}
+
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+ struct fuse_conn_dax *fcd = fc->dax;
+
+ if (fcd)
+ cancel_delayed_work_sync(&fcd->free_work);
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
return READ_ONCE(file->private_data);
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
+ req->fm = fm;
}
-static struct fuse_req *fuse_request_alloc(gfp_t flags)
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
{
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req)
- fuse_request_init(req);
+ fuse_request_init(fm, req);
return req;
}
}
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
int err;
atomic_inc(&fc->num_waiting);
if (fc->conn_error)
goto out;
- req = fuse_request_alloc(GFP_KERNEL);
+ req = fuse_request_alloc(fm, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
if (unlikely(req->in.h.uid == ((uid_t)-1) ||
req->in.h.gid == ((gid_t)-1))) {
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
return req;
return ERR_PTR(err);
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
* the 'end' callback is called if given, else the reference to the
* request is released
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
if (test_and_set_bit(FR_FINISHED, &req->flags))
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
}
if (test_bit(FR_ASYNC, &req->flags))
- req->args->end(fc, req->args, req->out.h.error);
+ req->args->end(fm, req->args, req->out.h.error);
put_request:
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
EXPORT_SYMBOL_GPL(fuse_request_end);
-static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_req *req)
{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
spin_lock(&fiq->lock);
/* Check whether we've already sent a request to interrupt this req */
if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
return 0;
}
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void request_wait_answer(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
int err;
/* matches barrier in fuse_dev_do_read() */
smp_mb__after_atomic();
if (test_bit(FR_SENT, &req->flags))
- queue_interrupt(fiq, req);
+ queue_interrupt(req);
}
if (!test_bit(FR_FORCE, &req->flags)) {
wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_req *req)
{
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
spin_lock(&fiq->lock);
__fuse_get_request(req);
queue_request_and_unlock(fiq, req);
- request_wait_answer(fc, req);
+ request_wait_answer(req);
/* Pairs with smp_wmb() in fuse_request_end() */
smp_rmb();
}
}
}
-static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_force_creds(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
ssize_t ret;
if (args->force) {
atomic_inc(&fc->num_waiting);
- req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+ req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
if (!args->nocreds)
- fuse_force_creds(fc, req);
+ fuse_force_creds(req);
__set_bit(FR_WAITING, &req->flags);
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
if (!args->noreply)
__set_bit(FR_ISREPLY, &req->flags);
- __fuse_request_send(fc, req);
+ __fuse_request_send(req);
ret = req->out.h.error;
if (!ret && args->out_argvar) {
BUG_ON(args->out_numargs == 0);
ret = args->out_args[args->out_numargs - 1].size;
}
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ret;
}
-static bool fuse_request_queue_background(struct fuse_conn *fc,
- struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
bool queued = false;
WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
return queued;
}
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags)
{
struct fuse_req *req;
if (args->force) {
WARN_ON(!args->nocreds);
- req = fuse_request_alloc(gfp_flags);
+ req = fuse_request_alloc(fm, gfp_flags);
if (!req)
return -ENOMEM;
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, true);
+ req = fuse_get_req(fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
fuse_args_to_req(req, args);
- if (!fuse_request_queue_background(fc, req)) {
- fuse_put_request(fc, req);
+ if (!fuse_request_queue_background(req)) {
+ fuse_put_request(req);
return -ENOTCONN;
}
}
EXPORT_SYMBOL_GPL(fuse_simple_background);
-static int fuse_simple_notify_reply(struct fuse_conn *fc,
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
struct fuse_args *args, u64 unique)
{
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &fm->fc->iq;
int err = 0;
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
} else {
err = -ENODEV;
spin_unlock(&fiq->lock);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
return err;
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
+ get_page(oldpage);
err = unlock_request(cs->req);
if (err)
- return err;
+ goto out_put_old;
fuse_copy_finish(cs);
err = pipe_buf_confirm(cs->pipe, buf);
if (err)
- return err;
+ goto out_put_old;
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
if (err) {
unlock_page(newpage);
- return err;
+ goto out_put_old;
}
get_page(newpage);
if (err) {
unlock_page(newpage);
put_page(newpage);
- return err;
+ goto out_put_old;
}
unlock_page(oldpage);
+ /* Drop ref for ap->pages[] array */
put_page(oldpage);
cs->len = 0;
- return 0;
+ err = 0;
+out_put_old:
+ /* Drop ref obtained in this function */
+ put_page(oldpage);
+ return err;
out_fallback_unlock:
unlock_page(newpage);
cs->offset = buf->offset;
err = lock_request(cs->req);
- if (err)
- return err;
+ if (!err)
+ err = 1;
- return 1;
+ goto out_put_old;
}
static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
+ get_page(page);
err = unlock_request(cs->req);
- if (err)
+ if (err) {
+ put_page(page);
return err;
+ }
fuse_copy_finish(cs);
buf = cs->pipebufs;
- get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
/* SETXATTR is special, since it may contain too large data */
if (args->opcode == FUSE_SETXATTR)
req->out.h.error = -E2BIG;
- fuse_request_end(fc, req);
+ fuse_request_end(req);
goto restart;
}
spin_lock(&fpq->lock);
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
- queue_interrupt(fiq, req);
- fuse_put_request(fc, req);
+ queue_interrupt(req);
+ fuse_put_request(req);
return reqsize;
if (!test_bit(FR_PRIVATE, &req->flags))
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
return err;
err_unlock:
fuse_copy_finish(cs);
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb) {
- err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
- outarg.off, outarg.len);
- }
+ err = fuse_reverse_inval_inode(fc, outarg.ino,
+ outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
- outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
down_read(&fc->killsb);
err = -ENOENT;
- if (!fc->sb)
- goto out_up_killsb;
-
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
goto out_up_killsb;
struct fuse_notify_retrieve_in inarg;
};
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_retrieve_args *ra =
kfree(ra);
}
-static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
struct fuse_notify_retrieve_out *outarg)
{
int err;
unsigned int offset;
size_t total_len = 0;
unsigned int num_pages;
+ struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
struct fuse_args_pages *ap;
args->in_args[0].value = &ra->inarg;
args->in_args[1].size = total_len;
- err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+ err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
- fuse_retrieve_end(fc, args, err);
+ fuse_retrieve_end(fm, args, err);
return err;
}
struct fuse_copy_state *cs)
{
struct fuse_notify_retrieve_out outarg;
+ struct fuse_mount *fm;
struct inode *inode;
+ u64 nodeid;
int err;
err = -EINVAL;
down_read(&fc->killsb);
err = -ENOENT;
- if (fc->sb) {
- u64 nodeid = outarg.nodeid;
+ nodeid = outarg.nodeid;
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
- if (inode) {
- err = fuse_retrieve(fc, inode, &outarg);
- iput(inode);
- }
+ inode = fuse_ilookup(fc, nodeid, &fm);
+ if (inode) {
+ err = fuse_retrieve(fm, inode, &outarg);
+ iput(inode);
}
up_read(&fc->killsb);
else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- err = queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(req);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
goto copy_finish;
}
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
out:
return err ? err : nbytes;
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct fuse_conn *fc, struct list_head *head)
+static void end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
req->out.h.error = -ECONNABORTED;
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
} else {
spin_unlock(&fc->lock);
}
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
{
struct inode *inode;
struct dentry *parent;
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
if (flags & LOOKUP_RCU)
goto out;
- fc = get_fuse_conn(inode);
+ fm = get_fuse_mount(inode);
forget = fuse_alloc_forget();
ret = -ENOMEM;
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+ fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
&entry->d_name, &outarg);
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
if (!ret) {
fi = get_fuse_inode(inode);
- if (outarg.nodeid != get_node_id(inode)) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+		if (outarg.nodeid != get_node_id(inode) ||
+		    (bool) IS_AUTOMOUNT(inode) !=
+		    (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+ fuse_queue_forget(fm->fc, forget,
+ outarg.nodeid, 1);
goto invalid;
}
spin_lock(&fi->lock);
return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
}
+/*
+ * Create a fuse_mount object with a new superblock (with path->dentry
+ * as the root), and return that mount so it can be auto-mounted on
+ * @path.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+ struct fs_context *fsc;
+ struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb);
+ struct fuse_conn *fc = parent_fm->fc;
+ struct fuse_mount *fm;
+ struct vfsmount *mnt;
+ struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+ struct super_block *sb;
+ int err;
+
+ fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fsc)) {
+ err = PTR_ERR(fsc);
+ goto out;
+ }
+
+ err = -ENOMEM;
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ goto out_put_fsc;
+
+ refcount_set(&fm->count, 1);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ fuse_mount_put(fm);
+ goto out_put_fsc;
+ }
+ fm->fc = fuse_conn_get(fc);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err)
+ goto out_put_sb;
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+ /* We are done configuring the superblock, so unlock it */
+ up_write(&sb->s_umount);
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ /* Create the submount */
+ mnt = vfs_create_mount(fsc);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
+ goto out_put_fsc;
+ }
+ mntget(mnt);
+ put_fs_context(fsc);
+ return mnt;
+
+out_put_sb:
+ /*
+ * Only jump here when fsc->root is NULL and sb is still locked
+ * (otherwise put_fs_context() will put the superblock)
+ */
+ deactivate_locked_super(sb);
+out_put_fsc:
+ put_fs_context(fsc);
+out:
+ return ERR_PTR(err);
+}
+
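For orientation, an illustrative sketch (not part of the patch) of how a submount actually gets auto-mounted: the VFS invokes ->d_automount only for dentries whose inode carries S_AUTOMOUNT, and a later hunk in this series makes fuse_iget() set that flag on FUSE_ATTR_SUBMOUNT directories. Roughly:

	/* fuse_iget(), for a submount root announced by the server
	 * (see the inode.c hunk below): */
	inode->i_flags |= S_AUTOMOUNT;

	/* generic path walk (fs/namei.c), sketched: dentries of such
	 * inodes get DCACHE_NEED_AUTOMOUNT, which routes the walk into
	 * the filesystem's hook: */
	if (path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)
		mnt = path->dentry->d_op->d_automount(path);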
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_delete = fuse_dentry_delete,
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
#endif
+ .d_automount = fuse_dentry_automount,
};
const struct dentry_operations fuse_root_dentry_operations = {
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
- fuse_lookup_init(fc, &args, nodeid, name, outarg);
- err = fuse_simple_request(fc, &args);
+ fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fm, &args);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
if (err || !outarg->nodeid)
goto out_put_forget;
attr_version);
err = -ENOMEM;
if (!*inode) {
- fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
goto out;
}
err = 0;
{
int err;
struct inode *inode;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
goto out_put_forget_req;
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
flags &= ~O_NOCTTY;
args.out_args[0].value = &outentry;
args.out_args[1].size = sizeof(outopen);
args.out_args[1].value = &outopen;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err)
goto out_free_ff;
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
- fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
}
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct inode *dir, struct dentry *entry,
umode_t mode)
{
args->out_numargs = 1;
args->out_args[0].size = sizeof(outarg);
args->out_args[0].value = &outarg;
- err = fuse_simple_request(fc, args);
+ err = fuse_simple_request(fm, args);
if (err)
goto out_put_forget_req;
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
if (!inode) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
return -ENOMEM;
}
kfree(forget);
dev_t rdev)
{
struct fuse_mknod_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, mode);
+ return create_new_entry(fm, &args, dir, entry, mode);
}
static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+ return create_new_entry(fm, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct inode *dir, struct dentry *entry,
const char *link)
{
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
unsigned len = strlen(link) + 1;
FUSE_ARGS(args);
args.in_args[0].value = entry->d_name.name;
args.in_args[1].size = len;
args.in_args[1].value = link;
- return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+ return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
void fuse_update_ctime(struct inode *inode)
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_UNLINK;
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_RMDIR;
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
{
int err;
struct fuse_rename2_in inarg;
- struct fuse_conn *fc = get_fuse_conn(olddir);
+ struct fuse_mount *fm = get_fuse_mount(olddir);
FUSE_ARGS(args);
memset(&inarg, 0, argsize);
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
fuse_invalidate_attr(d_inode(oldent));
int err;
struct fuse_link_in inarg;
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+ err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
/* Contrary to "normal" filesystems it can happen that link
makes two "logical" inodes point to the same "physical"
inode. We invalidate the attributes of the old one, so it
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
if (likely(inode->i_nlink < UINT_MAX))
inc_nlink(inode);
spin_unlock(&fi->lock);
int err;
struct fuse_getattr_in inarg;
struct fuse_attr_out outarg;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
u64 attr_version;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
(inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
STATX_BASIC_STATS & ~STATX_ATIME, 0);
}
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
struct dentry *dir;
struct dentry *entry;
- parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+ parent = fuse_ilookup(fc, parent_nodeid, NULL);
if (!parent)
return -ENOENT;
static int fuse_access(struct inode *inode, int mask)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_access_in inarg;
int err;
BUG_ON(mask & MAY_NOT_BLOCK);
- if (fc->no_access)
+ if (fm->fc->no_access)
return 0;
memset(&inarg, 0, sizeof(inarg));
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_access = 1;
+ fm->fc->no_access = 1;
err = 0;
}
return err;
static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
struct fuse_args_pages ap = {
.num_pages = 1,
ap.args.page_zeroing = true;
ap.args.out_numargs = 1;
ap.args.out_args[0].size = desc.length;
- res = fuse_simple_request(fc, &ap.args);
+ res = fuse_simple_request(fm, &ap.args);
fuse_invalidate_atime(inode);
*/
int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
inarg.valid = FATTR_MTIME;
inarg.mtime = inode->i_mtime.tv_sec;
inarg.mtimensec = inode->i_mtime.tv_nsec;
- if (fc->minor >= 23) {
+ if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
inarg.ctime = inode->i_ctime.tv_sec;
inarg.ctimensec = inode->i_ctime.tv_nsec;
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
- fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
/*
struct file *file)
{
struct inode *inode = d_inode(dentry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
loff_t oldsize;
int err;
bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool fault_blocked = false;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
if (err)
return err;
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
+ is_truncate = true;
+ }
+
+ if (FUSE_IS_DAX(inode) && is_truncate) {
+ down_write(&fi->i_mmap_sem);
+ fault_blocked = true;
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err) {
+ up_write(&fi->i_mmap_sem);
+ return err;
+ }
+ }
+
if (attr->ia_valid & ATTR_OPEN) {
/* This is coming from open(..., ... | O_TRUNC); */
WARN_ON(!(attr->ia_valid & ATTR_SIZE));
*/
i_size_write(inode, 0);
truncate_pagecache(inode, 0);
- return 0;
+ goto out;
}
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (WARN_ON(!S_ISREG(inode->i_mode)))
- return -EIO;
- is_truncate = true;
- }
-
/* Flush dirty data/metadata before non-truncate SETATTR */
if (is_wb && S_ISREG(inode->i_mode) &&
attr->ia_valid &
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
+
return 0;
error:
fuse_release_nowrite(inode);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
return err;
}
return pages;
}
-static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
memset(&inarg, 0, sizeof(inarg));
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
- if (!fc->atomic_o_trunc)
+ if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
args.opcode = opcode;
args.nodeid = nodeid;
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
struct fuse_release_args {
struct inode *inode;
};
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
struct fuse_file *ff;
if (unlikely(!ff))
return NULL;
- ff->fc = fc;
+ ff->fm = fm;
ff->release_args = kzalloc(sizeof(*ff->release_args),
GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- ff->kh = atomic64_inc_return(&fc->khctr);
+ ff->kh = atomic64_inc_return(&fm->fc->khctr);
return ff;
}
return ff;
}
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
if (refcount_dec_and_test(&ff->count)) {
struct fuse_args *args = &ff->release_args->args;
- if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+ if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
/* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fc, args, 0);
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
- fuse_simple_request(ff->fc, args);
- fuse_release_end(ff->fc, args, 0);
+ fuse_simple_request(ff->fm, args);
+ fuse_release_end(ff->fm, args, 0);
} else {
args->end = fuse_release_end;
- if (fuse_simple_background(ff->fc, args,
+ if (fuse_simple_background(ff->fm, args,
GFP_KERNEL | __GFP_NOFAIL))
- fuse_release_end(ff->fc, args, -ENOTCONN);
+ fuse_release_end(ff->fm, args, -ENOTCONN);
}
kfree(ff);
}
}
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
return -ENOMEM;
struct fuse_open_out outarg;
int err;
- err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
int err;
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
+ bool dax_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc && FUSE_IS_DAX(inode);
err = generic_file_open(inode, file);
if (err)
return err;
- if (is_wb_truncate) {
+ if (is_wb_truncate || dax_truncate) {
inode_lock(inode);
fuse_set_nowrite(inode);
}
- err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+ if (dax_truncate) {
+ down_write(&get_fuse_inode(inode)->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+ err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (is_wb_truncate) {
+out:
+ if (dax_truncate)
+ up_write(&get_fuse_inode(inode)->i_mmap_sem);
+
+	if (is_wb_truncate || dax_truncate) {
fuse_release_nowrite(inode);
inode_unlock(inode);
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
int flags, int opcode)
{
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
/* Inode is NULL on error path of fuse_create_open() */
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
- ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
(fl_owner_t) file);
}
/* Hold inode until release is finished */
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
struct fuse_flush_in inarg;
FUSE_ARGS(args);
return err;
err = 0;
- if (fc->no_flush)
+ if (fm->fc->no_flush)
goto inval_attr_out;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
- inarg.lock_owner = fuse_lock_owner_id(fc, id);
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
args.opcode = FUSE_FLUSH;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
args.in_args[0].value = &inarg;
args.force = true;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_flush = 1;
+ fm->fc->no_flush = 1;
err = 0;
}
	 * i_blocks is not maintained in memory by fuse; if writeback cache is
	 * enabled, i_blocks from the cached attr may not be accurate.
*/
- if (!err && fc->writeback_cache)
+ if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr(inode);
return err;
}
int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
kfree(ia);
}
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
fuse_io_free(ia);
}
-static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
struct fuse_io_args *ia, size_t num_bytes)
{
ssize_t err;
ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
- err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
if (err)
- fuse_aio_complete_req(fc, &ia->ap.args, err);
+ fuse_aio_complete_req(fm, &ia->ap.args, err);
return num_bytes;
}
{
struct file *file = ia->io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
if (owner != NULL) {
ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
- ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
+ ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- return fuse_simple_request(fc, &ia->ap.args);
+ return fuse_simple_request(fm, &ia->ap.args);
}
static void fuse_read_update_size(struct inode *inode, loff_t size,
static int fuse_do_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
loff_t pos = page_offset(page);
struct fuse_page_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
*/
fuse_wait_on_page_writeback(inode, page->index);
- attr_ver = fuse_get_attr_version(fc);
+ attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
- res = fuse_simple_request(fc, &ia.ap.args);
+ res = fuse_simple_request(fm, &ia.ap.args);
if (res < 0)
return res;
/*
return err;
}
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
int i;
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
WARN_ON((loff_t) (pos + count) < 0);
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
- ia->read.attr_ver = fuse_get_attr_version(fc);
- if (fc->async_read) {
+ ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+ if (fm->fc->async_read) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
- err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (!err)
return;
} else {
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
err = res < 0 ? res : 0;
}
- fuse_readpages_end(fc, &ap->args, err);
+ fuse_readpages_end(fm, &ap->args, err);
}
static void fuse_readahead(struct readahead_control *rac)
args->opcode = FUSE_WRITE;
args->nodeid = ff->nodeid;
args->in_numargs = 2;
- if (ff->fc->minor < 9)
+ if (ff->fm->fc->minor < 9)
args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
args->in_args[0].size = sizeof(ia->write.in);
struct kiocb *iocb = ia->io->iocb;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_write_in *inarg = &ia->write.in;
ssize_t err;
inarg->flags = fuse_write_flags(iocb);
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
- inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- err = fuse_simple_request(fc, &ia->ap.args);
+ err = fuse_simple_request(fm, &ia->ap.args);
if (!err && ia->write.out.size > count)
err = -EIO;
struct fuse_args_pages *ap = &ia->ap;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
int err;
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
- err = fuse_simple_request(fc, &ap->args);
+ err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
err = -EIO;
struct file *file = io->iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_read_iter(iocb, to);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_write_iter(iocb, from);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_conn *fc,
+static void fuse_writepage_finish(struct fuse_mount *fm,
struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
}
/* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc,
+static void fuse_send_writepage(struct fuse_mount *fm,
struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
args->force = true;
args->nocreds = true;
- err = fuse_simple_background(fc, args, GFP_ATOMIC);
+ err = fuse_simple_background(fm, args, GFP_ATOMIC);
if (err == -ENOMEM) {
spin_unlock(&fi->lock);
- err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+ err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&fi->lock);
}
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
/* After fuse_writepage_finish() aux request list is private */
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t crop = i_size_read(inode);
struct fuse_writepage_args *wpa;
wpa = list_entry(fi->queued_writes.next,
struct fuse_writepage_args, queue_entry);
list_del_init(&wpa->queue_entry);
- fuse_send_writepage(fc, wpa, crop);
+ fuse_send_writepage(fm, wpa, crop);
}
}
WARN_ON(fuse_insert_writeback(root, wpa));
}
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_writepage_args *wpa =
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_writepage_args *next = wpa->next;
	 * no invocations of fuse_writepage_end() while we're in the
	 * fuse_set_nowrite..fuse_release_nowrite section.
*/
- fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
{
struct fuse_file *ff = file->private_data;
+ /* DAX mmap is superior to direct_io mmap */
+ if (FUSE_IS_DAX(file_inode(file)))
+ return fuse_dax_mmap(file, vma);
+
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED */
if (vma->vm_flags & VM_MAYSHARE)
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
- err = convert_fuse_file_lock(fc, &outarg.lk, fl);
+ err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
return err;
}
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
- pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
+ pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
/* locking is restartable */
if (err == -EINTR)
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
- if (!inode->i_sb->s_bdev || fc->no_bmap)
+ if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
return 0;
memset(&inarg, 0, sizeof(inarg));
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS)
- fc->no_bmap = 1;
+ fm->fc->no_bmap = 1;
return err ? 0 : outarg.block;
}
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_lseek_in inarg = {
struct fuse_lseek_out outarg;
int err;
- if (fc->no_lseek)
+ if (fm->fc->no_lseek)
goto fallback;
args.opcode = FUSE_LSEEK;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -ENOSYS) {
- fc->no_lseek = 1;
+ fm->fc->no_lseek = 1;
goto fallback;
}
return err;
unsigned int flags)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_ioctl_in inarg = {
.fh = ff->fh,
.cmd = cmd,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!ap.pages || !iov_page)
goto out;
- fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
/*
* If restricted, initialize IO parameters as encoded in @cmd.
/* make sure there are enough buffer pages and init request with them */
err = -ENOMEM;
- if (max_pages > fc->max_pages)
+ if (max_pages > fm->fc->max_pages)
goto out;
while (ap.num_pages < max_pages) {
ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_simple_request(fc, &ap.args);
+ transferred = fuse_simple_request(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
goto out;
vaddr = kmap_atomic(ap.pages[0]);
- err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
kunmap_atomic(vaddr);
in_iov = iov_page;
out_iov = in_iov + in_iovs;
- err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
if (err)
goto out;
- err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
if (err)
goto out;
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
FUSE_ARGS(args);
int err;
- if (fc->no_poll)
+ if (fm->fc->no_poll)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
*/
if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
- fuse_register_polled_file(fc, ff);
+ fuse_register_polled_file(fm->fc, ff);
}
args.opcode = FUSE_POLL;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
return demangle_poll(outarg.revents);
if (err == -ENOSYS) {
- fc->no_poll = 1;
+ fm->fc->no_poll = 1;
return DEFAULT_POLLMASK;
}
return EPOLLERR;
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = ff->fc->async_dio;
+ io->async = ff->fm->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
/* optimization for short read */
if (io->async && !io->write && offset + count > i_size) {
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
+ iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
shortened = count - iov_iter_count(iter);
count -= shortened;
}
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_PUNCH_HOLE);
+ bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (fc->no_fallocate)
+ if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
if (lock_inode) {
inode_lock(inode);
+ if (block_faults) {
+ down_write(&fi->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
loff_t endbyte = offset + length - 1;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_fallocate = 1;
+ fm->fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
if (err)
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
bool changed = fuse_write_update_size(inode, offset + length);
- if (changed && fc->writeback_cache)
+ if (changed && fm->fc->writeback_cache)
file_update_time(file);
}
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (block_faults)
+ up_write(&fi->i_mmap_sem);
+
if (lock_inode)
inode_unlock(inode);
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct fuse_inode *fi_out = get_fuse_inode(inode_out);
- struct fuse_conn *fc = ff_in->fc;
+ struct fuse_mount *fm = ff_in->fm;
+ struct fuse_conn *fc = fm->fc;
FUSE_ARGS(args);
struct fuse_copy_file_range_in inarg = {
.fh_in = ff_in->fh,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
+ .get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
fi->writepages = RB_ROOT;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_inode_init(inode);
}
/** Lock to protect write related fields */
spinlock_t lock;
+
+ /**
+ * Can't take inode lock in fault path (leads to circular dependency).
+	 * The inode lock can't be taken in the fault path (it leads to a
+	 * circular dependency), so introduce another semaphore that can be
+	 * taken in the fault path and that other filesystem paths can take
+	 * to block faults.
+ struct rw_semaphore i_mmap_sem;
+
+#ifdef CONFIG_FUSE_DAX
+ /*
+ * Dax specific inode data
+ */
+ struct fuse_inode_dax *dax;
+#endif
};
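A sketch of how the two sides of i_mmap_sem are intended to pair up. The writer side is visible in the setattr, open(O_TRUNC) and fallocate hunks above; the fault side lives in dax.c, which this excerpt does not include, so its details (including the fuse_iomap_ops name) are assumptions:

	/* truncate/open/fallocate side: block new faults, then wait for
	 * existing DAX mappings to be torn down */
	down_write(&fi->i_mmap_sem);
	err = fuse_dax_break_layouts(inode, 0, 0);
	...
	up_write(&fi->i_mmap_sem);

	/* fault side (dax.c, assumed): take the semaphore shared and
	 * never the inode lock, avoiding the circular dependency */
	down_read(&fi->i_mmap_sem);
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops);
	up_read(&fi->i_mmap_sem);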
/** FUSE inode state bits */
};
struct fuse_conn;
+struct fuse_mount;
struct fuse_release_args;
/** FUSE specific file data */
struct fuse_file {
	/** Fuse mount for this file */
-	struct fuse_conn *fc;
+	struct fuse_mount *fm;
/* Argument space reserved for release */
struct fuse_release_args *release_args;
bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
- void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+ void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
};
struct fuse_args_pages {
/** virtio-fs's physically contiguous buffer for in and out args */
void *argbuf;
#endif
+
+ /** fuse_mount this request belongs to */
+ struct fuse_mount *fm;
};
struct fuse_iqueue;
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
- bool no_mount_options:1;
+ bool legacy_opts_show:1;
+ bool dax:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
+ /* DAX device, may be NULL */
+ struct dax_device *dax_dev;
+
/* fuse_dev pointer to fill in, should contain NULL on entry */
void **fudptr;
};
/**
* A Fuse connection.
*
- * This structure is created, when the filesystem is mounted, and is
- * destroyed, when the client device is closed and the filesystem is
- * unmounted.
+ * This structure is created, when the root filesystem is mounted, and
+ * is destroyed, when the client device is closed and the last
+ * fuse_mount is destroyed.
*/
struct fuse_conn {
	/** Lock protecting accesses to members of this structure */
/** cache READLINK responses in page cache */
unsigned cache_symlinks:1;
+ /* show legacy mount options */
+ unsigned int legacy_opts_show:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
- /* Do not show mount options */
- unsigned int no_mount_options:1;
+ /* Auto-mount submounts announced by the server */
+ unsigned int auto_submounts:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_conn_list */
+ /** Entry on the fuse_mount_list */
struct list_head entry;
- /** Device ID from super block */
+ /** Device ID from the root super block */
dev_t dev;
/** Dentries in the control filesystem */
/** Called on final put */
void (*release)(struct fuse_conn *);
- /** Super block for this connection. */
- struct super_block *sb;
-
- /** Read/write semaphore to hold when accessing sb. */
+ /**
+ * Read/write semaphore to hold when accessing the sb of any
+ * fuse_mount belonging to this connection
+ */
struct rw_semaphore killsb;
/** List of device instances belonging to this connection */
struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+ /* Dax specific conn data, non-NULL if DAX is enabled */
+ struct fuse_conn_dax *dax;
+#endif
+
+ /** List of filesystems using this connection */
+ struct list_head mounts;
};
-static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object lets multiple mounts share one fuse_conn, so that
+ * submounts can have dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+ /* Underlying (potentially shared) connection to the FUSE server */
+ struct fuse_conn *fc;
+
+ /* Refcount */
+ refcount_t count;
+
+ /*
+ * Super block for this connection (fc->killsb must be held when
+ * accessing this).
+ */
+ struct super_block *sb;
+
+ /* Entry on fc->mounts */
+ struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
}
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+
+ return fm ? fm->fc : NULL;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb);
+}
+
static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
{
- return get_fuse_conn_super(inode->i_sb);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+
+ return fm ? fm->fc : NULL;
}
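Most of the dev.c/dir.c/file.c churn above is one mechanical conversion enabled by these accessors; a representative call site, before and after:

	/* before: requests were tied directly to the connection */
	struct fuse_conn *fc = get_fuse_conn(inode);
	err = fuse_simple_request(fc, &args);
	if (fc->no_flush)
		...

	/* after: requests go through the per-superblock fuse_mount,
	 * and connection-wide state is reached via fm->fc */
	struct fuse_mount *fm = get_fuse_mount(inode);
	err = fuse_simple_request(fm, &args);
	if (fm->fc->no_flush)
		...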
static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
extern const struct dentry_operations fuse_root_dentry_operations;
/**
- * Inode to nodeid comparison.
- */
-int fuse_inode_eq(struct inode *inode, void *_nodeidp);
-
-/**
* Get a filled in inode
*/
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
*/
int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
* End a finished request
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
/**
*/
void fuse_conn_put(struct fuse_conn *fc);
+/**
+ * Acquire reference to fuse_mount
+ */
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm);
+
+/**
+ * Release reference to fuse_mount
+ */
+void fuse_mount_put(struct fuse_mount *fm);
+
struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_conn *fc);
+void fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
*/
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
-/**
- * Disassociate fuse connection from superblock and kill the superblock
+/*
+ * Fill in superblock for submounts
+ * @sb: partially-initialized superblock to fill in
+ * @parent_fi: The fuse_inode of the parent filesystem where this submount is
+ * mounted
+ */
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi);
+
+/*
+ * Remove the mount from the connection
*
- * Calls kill_anon_super(), do not use with bdev mounts.
+ * Returns whether this was the last mount
*/
-void fuse_kill_sb_anon(struct super_block *sb);
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
+ */
+void fuse_conn_destroy(struct fuse_mount *fm);
/**
* Add connection to control filesystem
void fuse_release_nowrite(struct inode *inode);
/**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result. Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm);
+
+/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len);
/**
 * - is a file or an empty directory
* then the dentry is unhashed (d_delete()).
*/
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name);
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
/**
u64 fuse_get_unique(struct fuse_iqueue *fiq);
void fuse_free_conn(struct fuse_conn *fc);
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_cleanup(struct inode *inode);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
#endif /* _FS_FUSE_I_H */
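A note on the FUSE_IS_DAX() definition above: IS_ENABLED(CONFIG_FUSE_DAX) is a compile-time constant, so when the option is off the compiler discards every DAX branch and the dax.c helpers need only these declarations, not #else stubs. For example, in the read path hunk earlier:

	if (FUSE_IS_DAX(inode))	/* constant-folds to 0 if !CONFIG_FUSE_DAX */
		return fuse_dax_read_iter(iocb, to);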
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ init_rwsem(&fi->i_mmap_sem);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
- if (!fi->forget) {
- kmem_cache_free(fuse_inode_cachep, fi);
- return NULL;
- }
+ if (!fi->forget)
+ goto out_free;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+ goto out_free_forget;
return &fi->inode;
+
+out_free_forget:
+ kfree(fi->forget);
+out_free:
+ kmem_cache_free(fuse_inode_cachep, fi);
+ return NULL;
}
static void fuse_free_inode(struct inode *inode)
mutex_destroy(&fi->mutex);
kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+ kfree(fi->dax);
+#endif
kmem_cache_free(fuse_inode_cachep, fi);
}
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
- fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
- fi->forget = NULL;
+
+ if (FUSE_IS_DAX(inode))
+ fuse_dax_inode_cleanup(inode);
+ if (fi->nlookup) {
+ fuse_queue_forget(fc, fi->forget, fi->nodeid,
+ fi->nlookup);
+ fi->forget = NULL;
+ }
}
if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
WARN_ON(!list_empty(&fi->write_files));
BUG();
}
-int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
{
u64 nodeid = *(u64 *) _nodeidp;
if (get_node_id(inode) == nodeid)
struct fuse_inode *fi;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- retry:
+ /*
+ * Auto mount points get their node id from the submount root, which is
+ * not a unique identifier within this filesystem.
+ *
+ * To avoid conflicts, do not place submount points into the inode hash
+ * table.
+ */
+ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+ S_ISDIR(attr->mode)) {
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ fuse_init_inode(inode, attr);
+ get_fuse_inode(inode)->nodeid = nodeid;
+ inode->i_flags |= S_AUTOMOUNT;
+ goto done;
+ }
+
+retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
return NULL;
iput(inode);
goto retry;
}
-
+done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
return inode;
}
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm)
+{
+ struct fuse_mount *fm_iter;
+ struct inode *inode;
+
+ WARN_ON(!rwsem_is_locked(&fc->killsb));
+ list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+ if (!fm_iter->sb)
+ continue;
+
+ inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ if (fm)
+ *fm = fm_iter;
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
- inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
return -ENOENT;
fuse_abort_conn(fc);
}
-static void fuse_send_destroy(struct fuse_conn *fc)
+static void fuse_send_destroy(struct fuse_mount *fm)
{
- if (fc->conn_init) {
+ if (fm->fc->conn_init) {
FUSE_ARGS(args);
args.opcode = FUSE_DESTROY;
args.force = true;
args.nocreds = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
}
}
static void fuse_put_super(struct super_block *sb)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
- mutex_lock(&fuse_mutex);
- list_del(&fc->entry);
- fuse_ctl_remove_conn(fc);
- mutex_unlock(&fuse_mutex);
-
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
}
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_current_process(fc)) {
+ if (!fuse_allow_current_process(fm->fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
convert_fuse_statfs(buf, &outarg.st);
return err;
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (fc->no_mount_options)
- return 0;
+ if (fc->legacy_opts_show) {
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
+ if (fc->default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (fc->allow_other)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ }
+#ifdef CONFIG_FUSE_DAX
+ if (fc->dax)
+ seq_puts(m, ",dax");
+#endif
- seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
- if (fc->default_permissions)
- seq_puts(m, ",default_permissions");
- if (fc->allow_other)
- seq_puts(m, ",allow_other");
- if (fc->max_read != ~0)
- seq_printf(m, ",max_read=%u", fc->max_read);
- if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
- seq_printf(m, ",blksize=%lu", sb->s_blocksize);
return 0;
}
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
{
memset(fc, 0, sizeof(*fc));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+
+ INIT_LIST_HEAD(&fc->mounts);
+ list_add(&fm->fc_entry, &fc->mounts);
+ fm->fc = fc;
+ refcount_set(&fm->count, 1);
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
+void fuse_mount_put(struct fuse_mount *fm)
+{
+ if (refcount_dec_and_test(&fm->count)) {
+ if (fm->fc)
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_mount_put);
+
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm)
+{
+ refcount_inc(&fm->count);
+ return fm;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_get);
+
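To summarize the refcounting introduced here, as wired up elsewhere in this series: the root fuse_mount starts with a count of 1 in fuse_conn_init() (below), each submount allocates its own fuse_mount in fuse_dentry_automount() (above), and fuse_put_super() drops the reference through fuse_mount_put(), whose final put also releases fm->fc. In outline:

	fuse_conn_init(fc, fm, ...);	/* root: refcount_set(&fm->count, 1) */
	...
	fuse_mount_put(fm);	/* last put: fuse_conn_put(fm->fc) + kfree(fm) */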
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
struct fuse_init_out out;
};
-static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
struct fuse_init_out *arg = &ia->out;
+ bool ok = true;
if (error || arg->major != FUSE_KERNEL_VERSION)
- fc->conn_error = 1;
+ ok = false;
else {
unsigned long ra_pages;
if (arg->flags & FUSE_HANDLE_KILLPRIV)
fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
- fc->sb->s_time_gran = arg->time_gran;
+ fm->sb->s_time_gran = arg->time_gran;
if ((arg->flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
- fc->sb->s_xattr = fuse_acl_xattr_handlers;
+ fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
if (arg->flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
min_t(unsigned int, FUSE_MAX_MAX_PAGES,
max_t(unsigned int, arg->max_pages, 1));
}
+ if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+ arg->flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
- fc->sb->s_bdi->ra_pages =
- min(fc->sb->s_bdi->ra_pages, ra_pages);
+ fm->sb->s_bdi->ra_pages =
+ min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
}
kfree(ia);
+ if (!ok) {
+ fc->conn_init = 0;
+ fc->conn_error = 1;
+ }
+
fuse_set_initialized(fc);
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_conn *fc)
+void fuse_send_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
ia->in.major = FUSE_KERNEL_VERSION;
ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
- ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+ ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
ia->in.flags |=
FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+ if (fm->fc->dax)
+ ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
+ if (fm->fc->auto_submounts)
+ ia->in.flags |= FUSE_SUBMOUNTS;
+
ia->args.opcode = FUSE_INIT;
ia->args.in_numargs = 1;
ia->args.in_args[0].size = sizeof(ia->in);
ia->args.nocreds = true;
ia->args.end = process_init_reply;
- if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fc, &ia->args, -ENOTCONN);
+ if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+ process_init_reply(fm, &ia->args, -ENOTCONN);
}
EXPORT_SYMBOL_GPL(fuse_send_init);
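For reference, a sketch of the other side of this negotiation: a userspace
server answering FUSE_INIT and advertising the new flags. The field values
are illustrative (a 4 KiB mapping granularity is assumed), not mandated by
this patch:

#include <linux/fuse.h>
#include <string.h>

/* Illustrative userspace INIT reply. With FUSE_MAP_ALIGNMENT set,
 * map_alignment carries log2(byte alignment) for the foffset/moffset
 * fields of fuse_setupmapping_in and fuse_removemapping_one.
 */
static void fill_init_reply(struct fuse_init_out *out)
{
	memset(out, 0, sizeof(*out));
	out->major = FUSE_KERNEL_VERSION;
	out->minor = FUSE_KERNEL_MINOR_VERSION;
	out->flags = FUSE_MAP_ALIGNMENT | FUSE_SUBMOUNTS;
	out->map_alignment = 12;	/* 1 << 12 == 4096-byte alignment */
}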
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+ const struct fuse_inode *fi)
+{
+ *attr = (struct fuse_attr){
+ .ino = fi->inode.i_ino,
+ .size = fi->inode.i_size,
+ .blocks = fi->inode.i_blocks,
+ .atime = fi->inode.i_atime.tv_sec,
+ .mtime = fi->inode.i_mtime.tv_sec,
+ .ctime = fi->inode.i_ctime.tv_sec,
+ .atimensec = fi->inode.i_atime.tv_nsec,
+ .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .mode = fi->inode.i_mode,
+ .nlink = fi->inode.i_nlink,
+ .uid = fi->inode.i_uid.val,
+ .gid = fi->inode.i_gid.val,
+ .rdev = fi->inode.i_rdev,
+ .blksize = 1u << fi->inode.i_blkbits,
+ };
+}
+
+static void fuse_sb_defaults(struct super_block *sb)
+{
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+
+ /*
+ * If we are not in the initial user namespace, POSIX
+ * ACLs must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct super_block *parent_sb = parent_fi->inode.i_sb;
+ struct fuse_attr root_attr;
+ struct inode *root;
+
+ fuse_sb_defaults(sb);
+ fm->sb = sb;
+
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+ sb->s_xattr = parent_sb->s_xattr;
+ sb->s_time_gran = parent_sb->s_time_gran;
+ sb->s_blocksize = parent_sb->s_blocksize;
+ sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+ sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+ if (parent_sb->s_subtype && !sb->s_subtype)
+ return -ENOMEM;
+
+ fuse_fill_attr_from_inode(&root_attr, parent_fi);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ /*
+ * This inode is just a duplicate, so it is not looked up and
+ * its nlookup should not be incremented. fuse_iget() does
+ * that, though, so undo it here.
+ */
+ get_fuse_inode(root)->nlookup--;
+ sb->s_d_op = &fuse_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
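A hypothetical sketch of how a superblock-creation path might use
fuse_fill_super_submount() when crossing into a submount (the caller name is
invented; error unwinding is elided):

/* Hypothetical caller, for illustration only. parent_fi is the fuse
 * directory inode being crossed into; sb is a freshly allocated
 * superblock for the submount.
 */
static int example_submount_fill(struct super_block *sb,
				 struct fuse_inode *parent_fi)
{
	int err;

	err = fuse_fill_super_submount(sb, parent_fi);
	if (err)
		return err;

	sb->s_flags |= SB_ACTIVE;	/* submount superblock is now live */
	return 0;
}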
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
struct fuse_dev *fud = NULL;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct inode *root;
struct dentry *root_dentry;
int err;
if (sb->s_flags & SB_MANDLOCK)
goto err;
- sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+ fuse_sb_defaults(sb);
if (ctx->is_bdev) {
#ifdef CONFIG_BLOCK
sb->s_subtype = ctx->subtype;
ctx->subtype = NULL;
- sb->s_magic = FUSE_SUPER_MAGIC;
- sb->s_op = &fuse_super_operations;
- sb->s_xattr = fuse_xattr_handlers;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_export_op = &fuse_export_operations;
- sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
- if (sb->s_user_ns != &init_user_ns)
- sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
-
- /*
- * If we are not in the initial user namespace posix
- * acls must be translated.
- */
- if (sb->s_user_ns != &init_user_ns)
- sb->s_xattr = fuse_no_acl_xattr_handlers;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
+ if (err)
+ goto err;
+ }
if (ctx->fudptr) {
err = -ENOMEM;
fud = fuse_dev_alloc_install(fc);
if (!fud)
- goto err;
+ goto err_free_dax;
}
fc->dev = sb->s_dev;
- fc->sb = sb;
+ fm->sb = sb;
err = fuse_bdi_init(fc, sb);
if (err)
goto err_dev_free;
fc->allow_other = ctx->allow_other;
fc->user_id = ctx->user_id;
fc->group_id = ctx->group_id;
- fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+ fc->legacy_opts_show = ctx->legacy_opts_show;
+ fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
- fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
err_dev_free:
if (fud)
fuse_dev_free(fud);
+ err_free_dax:
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
err:
return err;
}
struct file *file;
int err;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
err = -EINVAL;
file = fget(ctx->fd);
if (!fc)
goto err_fput;
- fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ goto err_fput;
+ }
+
+ fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
fc->release = fuse_free_conn;
- sb->s_fs_info = fc;
+
+ sb->s_fs_info = fm;
err = fuse_fill_super_common(sb, ctx);
if (err)
* CPUs after this
*/
fput(file);
- fuse_send_init(get_fuse_conn_super(sb));
+ fuse_send_init(get_fuse_mount_super(sb));
return 0;
err_put_conn:
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
sb->s_fs_info = NULL;
err_fput:
fput(file);
ctx->max_read = ~0;
ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+ ctx->legacy_opts_show = true;
#ifdef CONFIG_BLOCK
if (fc->fs_type == &fuseblk_fs_type) {
return 0;
}
-static void fuse_sb_destroy(struct super_block *sb)
+bool fuse_mount_remove(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ bool last = false;
- if (fc) {
- if (fc->destroy)
- fuse_send_destroy(fc);
+ down_write(&fc->killsb);
+ list_del_init(&fm->fc_entry);
+ if (list_empty(&fc->mounts))
+ last = true;
+ up_write(&fc->killsb);
- fuse_abort_conn(fc);
- fuse_wait_aborted(fc);
+ return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
- down_write(&fc->killsb);
- fc->sb = NULL;
- up_write(&fc->killsb);
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+
+ if (fc->destroy)
+ fuse_send_destroy(fm);
+
+ fuse_abort_conn(fc);
+ fuse_wait_aborted(fc);
+
+ if (!list_empty(&fc->entry)) {
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
-void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_kill_sb_anon(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_anon_super(sb);
}
-EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
#ifdef CONFIG_BLOCK
static void fuse_kill_sb_blk(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_block_super(sb);
}
static void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_forget_in inarg;
FUSE_ARGS(args);
args.force = true;
args.noreply = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
/* ignore errors */
}
ssize_t res;
struct page *page;
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
struct fuse_page_desc desc = { .length = PAGE_SIZE };
ap->pages = &page;
ap->descs = &desc;
if (plus) {
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
*/
#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/uio.h>
#include "fuse_i.h"
/* List of virtio-fs device instances and a lock for the list. Also provides
VQ_REQUEST
};
+#define VQ_NAME_LEN 24
+
/* Per-virtqueue state */
struct virtio_fs_vq {
spinlock_t lock;
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
- char name[24];
+ char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
struct virtio_fs_vq *vqs;
unsigned int nvqs; /* number of virtqueues */
unsigned int num_request_queues; /* number of request queues */
+ struct dax_device *dax_dev;
+
+ /* DAX memory window where file contents are mapped */
+ void *window_kaddr;
+ phys_addr_t window_phys_addr;
+ size_t window_len;
};
struct virtio_fs_forget_req {
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
+enum {
+ OPT_DAX,
+};
+
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+ fsparam_flag("dax", OPT_DAX),
+ {}
+};
+
+static int virtio_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, virtio_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_DAX:
+ ctx->dax = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
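With the parser above, "dax" becomes a per-mount option. A minimal usage
sketch via mount(2); the tag "myfs" and mount point are illustrative:

#include <sys/mount.h>

/* Illustration: mount a virtiofs filesystem with the DAX window
 * enabled. Returns 0 on success, -1 with errno set on failure.
 */
int example_mount(void)
{
	return mount("myfs", "/mnt/virtiofs", "virtiofs", 0, "dax");
}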
+
+static void virtio_fs_free_fc(struct fs_context *fc)
+{
+ struct fuse_fs_context *ctx = fc->fs_private;
+
+ kfree(ctx);
+}
+
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
dispatch_work.work);
- struct fuse_conn *fc = fsvq->fud->fc;
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
list_del_init(&req->list);
spin_unlock(&fsvq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
/* Dispatch pending requests */
spin_unlock(&fsvq->lock);
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
ret);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
}
struct virtio_fs_vq *fsvq)
{
struct fuse_pqueue *fpq = &fsvq->fud->pq;
- struct fuse_conn *fc = fsvq->fud->fc;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
clear_bit(FR_SENT, &req->flags);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
spin_lock(&fsvq->lock);
dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
schedule_work(&fsvq->done_work);
}
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+ int vq_type)
+{
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
+ spin_lock_init(&fsvq->lock);
+ INIT_LIST_HEAD(&fsvq->queued_reqs);
+ INIT_LIST_HEAD(&fsvq->end_reqs);
+ init_completion(&fsvq->in_flight_zero);
+
+ if (vq_type == VQ_REQUEST) {
+ INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
+ } else {
+ INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ }
+}
+
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
if (fs->num_request_queues == 0)
return -EINVAL;
- fs->nvqs = 1 + fs->num_request_queues;
+ fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
goto out;
}
+ /* Initialize the hiprio/forget request virtqueue */
callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
- snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
- "hiprio");
+ virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
- INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
- INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
- virtio_fs_hiprio_dispatch_work);
- init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
- spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
- spin_lock_init(&fs->vqs[i].lock);
- INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_request_dispatch_work);
- INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
- init_completion(&fs->vqs[i].in_flight_zero);
- snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
- "requests.%u", i - VQ_REQUEST);
+ char vq_name[VQ_NAME_LEN];
+
+ snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+ virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
names[i] = fs->vqs[i].name;
}
vdev->config->del_vqs(vdev);
}
+/* Map a window offset to a page frame number. The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct virtio_fs *fs = dax_get_private(dax_dev);
+ phys_addr_t offset = PFN_PHYS(pgoff);
+ size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
+
+ if (kaddr)
+ *kaddr = fs->window_kaddr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+ PFN_DEV | PFN_MAP);
+ return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
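A worked example of the clamp above, with illustrative numbers:

/*
 * For a 1 GiB window (window_len = 1 << 30, PAGE_SIZE = 4096) and
 * pgoff = 262143 (the window's last page), max_nr_pages =
 * 262144 - 262143 = 1, so a request for nr_pages = 16 returns 1 and
 * the caller must issue another dax_direct_access() call for the
 * remainder.
 */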
+
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_from_iter(addr, bytes, i);
+}
+
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+ .direct_access = virtio_fs_direct_access,
+ .copy_from_iter = virtio_fs_copy_from_iter,
+ .copy_to_iter = virtio_fs_copy_to_iter,
+ .zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+ struct dax_device *dax_dev = data;
+
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ struct virtio_shm_region cache_reg;
+ struct dev_pagemap *pgmap;
+ bool have_cache;
+
+ if (!IS_ENABLED(CONFIG_FUSE_DAX))
+ return 0;
+
+ /* Get cache region */
+ have_cache = virtio_get_shm_region(vdev, &cache_reg,
+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+ if (!have_cache) {
+ dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+ return 0;
+ }
+
+ if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+ dev_name(&vdev->dev))) {
+ dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+ cache_reg.addr, cache_reg.len);
+ return -EBUSY;
+ }
+
+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+ cache_reg.addr);
+
+ pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+ /* Ideally we would directly use the PCI BAR resource but
+ * devm_memremap_pages() wants its own copy in pgmap. So
+ * initialize a struct range from scratch (only the start
+ * and end fields will be used).
+ */
+ pgmap->range = (struct range) {
+ .start = (phys_addr_t) cache_reg.addr,
+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+ };
+ pgmap->nr_range = 1;
+
+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+ if (IS_ERR(fs->window_kaddr))
+ return PTR_ERR(fs->window_kaddr);
+
+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+ fs->window_len = (phys_addr_t) cache_reg.len;
+
+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+ __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+ if (IS_ERR(fs->dax_dev))
+ return PTR_ERR(fs->dax_dev);
+
+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+ fs->dax_dev);
+}
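For orientation, a worked example of the resulting window (all numbers
illustrative):

/*
 * If the device exposes a 4 GiB cache region at guest physical address
 * 0x180000000, then after devm_memremap_pages():
 *   window_phys_addr = 0x180000000
 *   window_len       = 0x100000000
 *   window_kaddr     = kernel mapping of that range
 * so pgoff 0 in virtio_fs_direct_access() corresponds to physical
 * address 0x180000000.
 */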
+
static int virtio_fs_probe(struct virtio_device *vdev)
{
struct virtio_fs *fs;
/* TODO vq affinity */
+ ret = virtio_fs_setup_dax(vdev, fs);
+ if (ret < 0)
+ goto out_vqs;
+
/* Bring the device online in case the filesystem is mounted and
* requests need to be sent before we return.
*/
spin_unlock(&fiq->lock);
}
+/* Count number of scatter-gather elements required */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ this_len = min(page_descs[i].length, total_len);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
/* Return the number of scatter-gather list elements required */
static unsigned int sg_count_fuse_req(struct fuse_req *req)
{
struct fuse_args *args = req->args;
struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
- unsigned int total_sgs = 1 /* fuse_in_header */;
+ unsigned int size, total_sgs = 1 /* fuse_in_header */;
if (args->in_numargs - args->in_pages)
total_sgs += 1;
- if (args->in_pages)
- total_sgs += ap->num_pages;
+ if (args->in_pages) {
+ size = args->in_args[args->in_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
if (!test_bit(FR_ISREPLY, &req->flags))
return total_sgs;
if (args->out_numargs - args->out_pages)
total_sgs += 1;
- if (args->out_pages)
- total_sgs += ap->num_pages;
+ if (args->out_pages) {
+ size = args->out_args[args->out_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
return total_sgs;
}
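A worked example of the tightened count (request contents illustrative):

/*
 * A write carrying 5000 bytes across two 4096-byte pages
 * (descs[0].length == descs[1].length == 4096, last in_arg size 5000)
 * counts two page elements; the same two pages holding only 3000 bytes
 * count one, because the loop stops once total_len is consumed. The
 * old code always charged ap->num_pages elements.
 */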
.release = virtio_fs_fiq_release,
};
-static int virtio_fs_fill_super(struct super_block *sb)
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ ctx->rootmode = S_IFDIR;
+ ctx->default_permissions = 1;
+ ctx->allow_other = 1;
+ ctx->max_read = UINT_MAX;
+ ctx->blksize = 512;
+ ctx->destroy = true;
+ ctx->no_control = true;
+ ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct virtio_fs *fs = fc->iq.priv;
+ struct fuse_fs_context *ctx = fsc->fs_private;
unsigned int i;
int err;
- struct fuse_fs_context ctx = {
- .rootmode = S_IFDIR,
- .default_permissions = 1,
- .allow_other = 1,
- .max_read = UINT_MAX,
- .blksize = 512,
- .destroy = true,
- .no_control = true,
- .no_force_umount = true,
- .no_mount_options = true,
- };
+ virtio_fs_ctx_set_defaults(ctx);
mutex_lock(&virtio_fs_mutex);
/* After holding mutex, make sure virtiofs device is still there.
}
/* virtiofs allocates and installs its own fuse devices */
- ctx.fudptr = NULL;
- err = fuse_fill_super_common(sb, &ctx);
+ ctx->fudptr = NULL;
+ if (ctx->dax)
+ ctx->dax_dev = fs->dax_dev;
+ err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
/* Previous unmount will stop all queues. Start these again */
virtio_fs_start_all_queues(fs);
- fuse_send_init(fc);
+ fuse_send_init(fm);
mutex_unlock(&virtio_fs_mutex);
return 0;
return err;
}
-static void virtio_kill_sb(struct super_block *sb)
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct virtio_fs *vfs;
- struct virtio_fs_vq *fsvq;
-
- /* If mount failed, we can still be called without any fc */
- if (!fc)
- return fuse_kill_sb_anon(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *vfs = fc->iq.priv;
+ struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
- vfs = fc->iq.priv;
- fsvq = &vfs->vqs[VQ_HIPRIO];
+ /* Stop the DAX worker. evict_inodes() will be called soon and
+ * will free all memory ranges belonging to all inodes.
+ */
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_cancel_work(fc);
/* Stop forget queue. Soon destroy will be sent */
spin_lock(&fsvq->lock);
spin_unlock(&fsvq->lock);
virtio_fs_drain_all_queues(vfs);
- fuse_kill_sb_anon(sb);
+ fuse_conn_destroy(fm);
- /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+ /* fuse_conn_destroy() must have sent destroy. Stop all queues
* and drain one more time and free fuse devices. Freeing fuse
* devices will drop their reference on fuse_conn and that in
* turn will drop its reference on virtio_fs object.
virtio_fs_free_devs(vfs);
}
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ /* If mount failed, we can still be called without any fc */
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ virtio_fs_conn_destroy(fm);
+ }
+ kill_anon_super(sb);
+}
+
static int virtio_fs_test_super(struct super_block *sb,
struct fs_context *fsc)
{
- struct fuse_conn *fc = fsc->s_fs_info;
+ struct fuse_mount *fsc_fm = fsc->s_fs_info;
+ struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
- return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+ return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
}
static int virtio_fs_set_super(struct super_block *sb,
err = get_anon_bdev(&sb->s_dev);
if (!err)
- fuse_conn_get(fsc->s_fs_info);
+ fuse_mount_get(fsc->s_fs_info);
return err;
}
struct virtio_fs *fs;
struct super_block *sb;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
int err;
/* This gets a reference on virtio_fs object. This ptr gets installed
return -ENOMEM;
}
- fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
- fs);
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm) {
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
+ &virtio_fs_fiq_ops, fs);
fc->release = fuse_free_conn;
fc->delete_stale = true;
+ fc->auto_submounts = true;
- fsc->s_fs_info = fc;
+ fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
- err = virtio_fs_fill_super(sb);
+ err = virtio_fs_fill_super(sb, fsc);
if (err) {
deactivate_locked_super(sb);
return err;
}
static const struct fs_context_operations virtio_fs_context_ops = {
+ .free = virtio_fs_free_fc,
+ .parse_param = virtio_fs_parse_param,
.get_tree = virtio_fs_get_tree,
};
static int virtio_fs_init_fs_context(struct fs_context *fsc)
{
+ struct fuse_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fsc->fs_private = ctx;
fsc->ops = &virtio_fs_context_ops;
return 0;
}
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
int err;
- if (fc->no_setxattr)
+ if (fm->fc->no_setxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
args.in_args[1].value = name;
args.in_args[2].size = size;
args.in_args[2].value = value;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_setxattr = 1;
+ fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (fc->no_getxattr)
+ if (fm->fc->no_getxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
+ fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_current_process(fc))
+ if (!fuse_allow_current_process(fm->fc))
return -EACCES;
- if (fc->no_listxattr)
+ if (fm->fc->no_listxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
+ fm->fc->no_listxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
int fuse_removexattr(struct inode *inode, const char *name)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
int err;
- if (fc->no_removexattr)
+ if (fm->fc->no_removexattr)
return -EOPNOTSUPP;
args.opcode = FUSE_REMOVEXATTR;
args.in_numargs = 1;
args.in_args[0].size = strlen(name) + 1;
args.in_args[0].value = name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_removexattr = 1;
+ fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
struct dax_device *dax_dev, struct writeback_control *wbc);
struct page *dax_layout_busy_page(struct address_space *mapping);
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
+ loff_t start, loff_t end);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
#else
return NULL;
}
+static inline struct page *dax_layout_busy_page_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ return NULL;
+}
+
static inline int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc)
{
* - add FUSE_WRITE_KILL_PRIV flag
* - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
* - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag
+ *
+ * 7.32
+ * - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS
*/
#ifndef _LINUX_FUSE_H
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 31
+#define FUSE_KERNEL_MINOR_VERSION 32
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
uint32_t gid;
uint32_t rdev;
uint32_t blksize;
- uint32_t padding;
+ uint32_t flags;
};
struct fuse_kstatfs {
* FUSE_CACHE_SYMLINKS: cache READLINK responses
* FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
* FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
- * FUSE_MAP_ALIGNMENT: map_alignment field is valid
+ * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for
+ * foffset and moffset fields in struct
+ * fuse_setupmapping_out and fuse_removemapping_one.
+ * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
#define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
#define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
#define FUSE_MAP_ALIGNMENT (1 << 26)
+#define FUSE_SUBMOUNTS (1 << 27)
/**
* CUSE INIT request/reply flags
*/
#define FUSE_FSYNC_FDATASYNC (1 << 0)
+/**
+ * fuse_attr flags
+ *
+ * FUSE_ATTR_SUBMOUNT: Object is a submount root
+ */
+#define FUSE_ATTR_SUBMOUNT (1 << 0)
+
enum fuse_opcode {
FUSE_LOOKUP = 1,
FUSE_FORGET = 2, /* no reply */
uint64_t flags;
};
+#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
+struct fuse_setupmapping_in {
+ /* An already open handle */
+ uint64_t fh;
+ /* Offset into the file to start the mapping */
+ uint64_t foffset;
+ /* Length of mapping required */
+ uint64_t len;
+ /* Flags, FUSE_SETUPMAPPING_FLAG_* */
+ uint64_t flags;
+ /* Offset in Memory Window */
+ uint64_t moffset;
+};
+
+struct fuse_removemapping_in {
+ /* number of fuse_removemapping_one entries that follow */
+ uint32_t count;
+};
+
+struct fuse_removemapping_one {
+ /* Offset into the DAX window at which to start the unmapping */
+ uint64_t moffset;
+ /* Length of mapping to remove */
+ uint64_t len;
+};
+
+#define FUSE_REMOVEMAPPING_MAX_ENTRY \
+ (PAGE_SIZE / sizeof(struct fuse_removemapping_one))
+
#endif /* _LINUX_FUSE_H */
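FUSE_REMOVEMAPPING_MAX_ENTRY bounds how many fuse_removemapping_one entries
fit in a single request. A sketch of batching a larger set of ranges under
that bound; send_one_request() is a hypothetical transport helper, not part
of this patch:

/* Hypothetical helper that ships one FUSE_REMOVEMAPPING request */
extern int send_one_request(struct fuse_removemapping_in *inarg,
			    struct fuse_removemapping_one *ranges,
			    size_t count);

static int example_removemapping(struct fuse_removemapping_one *ranges,
				 size_t n_ranges)
{
	while (n_ranges) {
		size_t batch = min_t(size_t, n_ranges,
				     FUSE_REMOVEMAPPING_MAX_ENTRY);
		struct fuse_removemapping_in inarg = { .count = batch };
		int err = send_one_request(&inarg, ranges, batch);

		if (err)
			return err;
		ranges += batch;
		n_ranges -= batch;
	}
	return 0;
}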
__le32 num_request_queues;
} __attribute__((packed));
+/* For the id field in virtio_pci_shm_cap */
+#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+
#endif /* _UAPI_LINUX_VIRTIO_FS_H */