----------------------
This describes how the VFS can manipulate an open file. As of kernel
-4.1, the following members are defined:
+4.18, the following members are defined:
struct file_operations {
struct module *owner;
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
+ int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
+ int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+ int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+ int (*fadvise)(struct file *, loff_t, loff_t, int);
};
Again, all methods are called without any locks being held, unless
iterate: called when the VFS needs to read the directory contents
+ iterate_shared: called when the VFS needs to read the directory contents
+ when filesystem supports concurrent dir iterators
+
poll: called by the VFS when a process wants to check if there is
activity on this file and (optionally) go to sleep until there
is activity. Called by the select(2) and poll(2) system calls
fallocate: called by the VFS to preallocate blocks or punch a hole.
+ copy_file_range: called by the copy_file_range(2) system call.
+
+ clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
+ FICLONE commands.
+
+ dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
+ command.
+
+ fadvise: possibly called by the fadvise64() system call.
+
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
- file->f_mapping = realfile->f_mapping;
-
file->private_data = realfile;
return 0;
return ret;
}
+static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ int ret;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fadvise(real.file, offset, len, advice);
+ revert_creds(old_cred);
+
+ fdput(real);
+
+ return ret;
+}
+
static long ovl_real_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
.fsync = ovl_fsync,
.mmap = ovl_mmap,
.fallocate = ovl_fallocate,
+ .fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
.compat_ioctl = ovl_compat_ioctl,
return -EOPNOTSUPP;
old_cred = ovl_override_creds(inode->i_sb);
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(realinode->i_mapping);
+
err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
revert_creds(old_cred);
.update_time = ovl_update_time,
};
+const struct address_space_operations ovl_aops = {
+ /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
+ .direct_IO = noop_direct_IO,
+};
+
/*
* It is possible to stack overlayfs instance on top of another
* overlayfs instance as lower layer. We need to annonate the
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
inode->i_fop = &ovl_file_operations;
+ inode->i_mapping->a_ops = &ovl_aops;
break;
case S_IFDIR:
if (err)
goto out;
- err = -EBUSY;
- if (ovl_inuse_trylock(upperpath->dentry)) {
- ofs->upperdir_locked = true;
- } else if (ofs->config.index) {
- pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
- goto out;
- } else {
- pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
- }
-
upper_mnt = clone_private_mount(upperpath);
err = PTR_ERR(upper_mnt);
if (IS_ERR(upper_mnt)) {
/* Don't inherit atime flags */
upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
ofs->upper_mnt = upper_mnt;
+
+ err = -EBUSY;
+ if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
+ ofs->upperdir_locked = true;
+ } else if (ofs->config.index) {
+ pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
+ goto out;
+ } else {
+ pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+ }
+
err = 0;
out:
return err;
goto out;
}
+ ofs->workbasedir = dget(workpath.dentry);
+
err = -EBUSY;
- if (ovl_inuse_trylock(workpath.dentry)) {
+ if (ovl_inuse_trylock(ofs->workbasedir)) {
ofs->workdir_locked = true;
} else if (ofs->config.index) {
pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
- ofs->workbasedir = dget(workpath.dentry);
err = ovl_make_workdir(ofs, &workpath);
if (err)
goto out;
u64);
int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
+ int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
struct inode_operations {
extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);
+/* mm/fadvise.c */
+extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
+ int advice);
+
#endif /* _LINUX_FS_H */
mmu-$(CONFIG_MMU) += process_vm_access.o
endif
-obj-y := filemap.o mempool.o oom_kill.o \
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
obj-y += bootmem.o
endif
-obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
* deactivate the pages and clear PG_Referenced.
*/
-int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
+ int advice)
{
- struct fd f = fdget(fd);
struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
- int ret = 0;
-
- if (!f.file)
- return -EBADF;
- inode = file_inode(f.file);
- if (S_ISFIFO(inode->i_mode)) {
- ret = -ESPIPE;
- goto out;
- }
+ inode = file_inode(file);
+ if (S_ISFIFO(inode->i_mode))
+ return -ESPIPE;
- mapping = f.file->f_mapping;
- if (!mapping || len < 0) {
- ret = -EINVAL;
- goto out;
- }
+ mapping = file->f_mapping;
+ if (!mapping || len < 0)
+ return -EINVAL;
bdi = inode_to_bdi(mapping->host);
/* no bad return value, but ignore advice */
break;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
- goto out;
+ return 0;
}
/*
switch (advice) {
case POSIX_FADV_NORMAL:
- f.file->f_ra.ra_pages = bdi->ra_pages;
- spin_lock(&f.file->f_lock);
- f.file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&f.file->f_lock);
+ file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
- spin_lock(&f.file->f_lock);
- f.file->f_mode |= FMODE_RANDOM;
- spin_unlock(&f.file->f_lock);
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
- f.file->f_ra.ra_pages = bdi->ra_pages * 2;
- spin_lock(&f.file->f_lock);
- f.file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&f.file->f_lock);
+ file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
* Ignore return value because fadvise() shall return
* success even if filesystem can't retrieve a hint,
*/
- force_page_cache_readahead(mapping, f.file, start_index,
- nrpages);
+ force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
break;
}
break;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
-out:
+ return 0;
+}
+
+int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ if (file->f_op->fadvise)
+ return file->f_op->fadvise(file, offset, len, advice);
+
+ return generic_fadvise(file, offset, len, advice);
+}
+EXPORT_SYMBOL(vfs_fadvise);
+
+#ifdef CONFIG_ADVISE_SYSCALLS
+
+int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+ struct fd f = fdget(fd);
+ int ret;
+
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_fadvise(f.file, offset, len, advice);
+
fdput(f);
return ret;
}
}
#endif
+#endif
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
+#include <linux/fadvise.h>
#include "internal.h"
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t index, unsigned long nr)
-{
- if (!mapping || !mapping->a_ops)
- return -EINVAL;
-
- /*
- * Readahead doesn't make sense for DAX inodes, but we don't want it
- * to report a failure either. Instead, we just return success and
- * don't do any work.
- */
- if (dax_mapping(mapping))
- return 0;
-
- return force_page_cache_readahead(mapping, filp, index, nr);
-}
-
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
ssize_t ret;
ret = -EBADF;
f = fdget(fd);
- if (f.file) {
- if (f.file->f_mode & FMODE_READ) {
- struct address_space *mapping = f.file->f_mapping;
- pgoff_t start = offset >> PAGE_SHIFT;
- pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
- unsigned long len = end - start + 1;
- ret = do_readahead(mapping, f.file, start, len);
- }
- fdput(f);
- }
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
+ goto out;
+
+ /*
+ * The readahead() syscall is intended to run only on files
+ * that can execute readahead. If readahead is not possible
+ * on this file, then we must return -EINVAL.
+ */
+ ret = -EINVAL;
+ if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
+ !S_ISREG(file_inode(f.file)->i_mode))
+ goto out;
+
+ ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+out:
+ fdput(f);
return ret;
}