Merge tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt

index 4b2084d..a6c6a8a 100644 (file)
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -848,7 +848,7 @@ struct file_operations
  ----------------------
  
  This describes how the VFS can manipulate an open file. As of kernel
-4.1, the following members are defined:
+4.18, the following members are defined:
  
  struct file_operations {
         struct module *owner;
@@ -858,11 +858,11 @@ struct file_operations {
         ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
         ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
         int (*iterate) (struct file *, struct dir_context *);
+       int (*iterate_shared) (struct file *, struct dir_context *);
         __poll_t (*poll) (struct file *, struct poll_table_struct *);
         long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
         long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
         int (*mmap) (struct file *, struct vm_area_struct *);
-       int (*mremap)(struct file *, struct vm_area_struct *);
         int (*open) (struct inode *, struct file *);
         int (*flush) (struct file *, fl_owner_t id);
         int (*release) (struct inode *, struct file *);
@@ -882,6 +882,10 @@ struct file_operations {
  #ifndef CONFIG_MMU
         unsigned (*mmap_capabilities)(struct file *);
  #endif
+       ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
+       int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+       int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+       int (*fadvise)(struct file *, loff_t, loff_t, int);
  };
  
  Again, all methods are called without any locks being held, unless
@@ -899,6 +903,9 @@ otherwise noted.
  
    iterate: called when the VFS needs to read the directory contents
  
+  iterate_shared: called when the VFS needs to read the directory contents
+       when filesystem supports concurrent dir iterators
+
    poll: called by the VFS when a process wants to check if there is
         activity on this file and (optionally) go to sleep until there
         is activity. Called by the select(2) and poll(2) system calls
@@ -951,6 +958,16 @@ otherwise noted.
  
    fallocate: called by the VFS to preallocate blocks or punch a hole.
  
+  copy_file_range: called by the copy_file_range(2) system call.
+
+  clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
+       FICLONE commands.
+
+  dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
+       command.
+
+  fadvise: possibly called by the fadvise64() system call.
+
  Note that the file operations are implemented by the specific
  filesystem in which the inode resides. When opening a device node
  (character or block special) most filesystems will call special
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c

index 32e9282..aeaefd2 100644 (file)
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file)
         if (IS_ERR(realfile))
                 return PTR_ERR(realfile);
  
-       /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
-       file->f_mapping = realfile->f_mapping;
-
         file->private_data = realfile;
  
         return 0;
@@ -334,6 +331,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
         return ret;
  }
  
+static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+       struct fd real;
+       const struct cred *old_cred;
+       int ret;
+
+       ret = ovl_real_fdget(file, &real);
+       if (ret)
+               return ret;
+
+       old_cred = ovl_override_creds(file_inode(file)->i_sb);
+       ret = vfs_fadvise(real.file, offset, len, advice);
+       revert_creds(old_cred);
+
+       fdput(real);
+
+       return ret;
+}
+
  static long ovl_real_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
  {
@@ -502,6 +518,7 @@ const struct file_operations ovl_file_operations = {
         .fsync          = ovl_fsync,
         .mmap           = ovl_mmap,
         .fallocate      = ovl_fallocate,
+       .fadvise        = ovl_fadvise,
         .unlocked_ioctl = ovl_ioctl,
         .compat_ioctl   = ovl_compat_ioctl,
  
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c

index e0bb217..b6ac545 100644 (file)
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 return -EOPNOTSUPP;
  
         old_cred = ovl_override_creds(inode->i_sb);
+
+       if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
+               filemap_write_and_wait(realinode->i_mapping);
+
         err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
         revert_creds(old_cred);
  
@@ -500,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = {
         .update_time    = ovl_update_time,
  };
  
+const struct address_space_operations ovl_aops = {
+       /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
+       .direct_IO              = noop_direct_IO,
+};
+
  /*
   * It is possible to stack overlayfs instance on top of another
   * overlayfs instance as lower layer. We need to annonate the
@@ -571,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
         case S_IFREG:
                 inode->i_op = &ovl_file_inode_operations;
                 inode->i_fop = &ovl_file_operations;
+               inode->i_mapping->a_ops = &ovl_aops;
                 break;
  
         case S_IFDIR:
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c

index 2e0fc93..30adc9d 100644 (file)
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
         if (err)
                 goto out;
  
-       err = -EBUSY;
-       if (ovl_inuse_trylock(upperpath->dentry)) {
-               ofs->upperdir_locked = true;
-       } else if (ofs->config.index) {
-               pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
-               goto out;
-       } else {
-               pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
-       }
-
         upper_mnt = clone_private_mount(upperpath);
         err = PTR_ERR(upper_mnt);
         if (IS_ERR(upper_mnt)) {
@@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
         /* Don't inherit atime flags */
         upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
         ofs->upper_mnt = upper_mnt;
+
+       err = -EBUSY;
+       if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
+               ofs->upperdir_locked = true;
+       } else if (ofs->config.index) {
+               pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
+               goto out;
+       } else {
+               pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+       }
+
         err = 0;
  out:
         return err;
@@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
                 goto out;
         }
  
+       ofs->workbasedir = dget(workpath.dentry);
+
         err = -EBUSY;
-       if (ovl_inuse_trylock(workpath.dentry)) {
+       if (ovl_inuse_trylock(ofs->workbasedir)) {
                 ofs->workdir_locked = true;
         } else if (ofs->config.index) {
                 pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
@@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
                 pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
         }
  
-       ofs->workbasedir = dget(workpath.dentry);
         err = ovl_make_workdir(ofs, &workpath);
         if (err)
                 goto out;
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 3332270..6c0b4a1 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1763,6 +1763,7 @@ struct file_operations {
                         u64);
         int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
                         u64);
+       int (*fadvise)(struct file *, loff_t, loff_t, int);
  } __randomize_layout;
  
  struct inode_operations {
@@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode)
  extern bool path_noexec(const struct path *path);
  extern void inode_nohighmem(struct inode *inode);
  
+/* mm/fadvise.c */
+extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
+                      int advice);
+
  #endif /* _LINUX_FS_H */
diff --git a/mm/Makefile b/mm/Makefile

index 8716bda..26ef77a 100644 (file)
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
  mmu-$(CONFIG_MMU)      += process_vm_access.o
  endif
  
-obj-y                  := filemap.o mempool.o oom_kill.o \
+obj-y                  := filemap.o mempool.o oom_kill.o fadvise.o \
                            maccess.o page_alloc.o page-writeback.o \
                            readahead.o swap.o truncate.o vmscan.o shmem.o \
                            util.o mmzone.o vmstat.o backing-dev.o \
@@ -49,7 +49,6 @@ else
         obj-y           += bootmem.o
  endif
  
-obj-$(CONFIG_ADVISE_SYSCALLS)  += fadvise.o
  ifdef CONFIG_MMU
         obj-$(CONFIG_ADVISE_SYSCALLS)   += madvise.o
  endif
diff --git a/mm/fadvise.c b/mm/fadvise.c

index 2d8376e..467bcd0 100644 (file)
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,9 +27,9 @@
   * deactivate the pages and clear PG_Referenced.
   */
  
-int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
+                          int advice)
  {
-       struct fd f = fdget(fd);
         struct inode *inode;
         struct address_space *mapping;
         struct backing_dev_info *bdi;
@@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
         pgoff_t start_index;
         pgoff_t end_index;
         unsigned long nrpages;
-       int ret = 0;
-
-       if (!f.file)
-               return -EBADF;
  
-       inode = file_inode(f.file);
-       if (S_ISFIFO(inode->i_mode)) {
-               ret = -ESPIPE;
-               goto out;
-       }
+       inode = file_inode(file);
+       if (S_ISFIFO(inode->i_mode))
+               return -ESPIPE;
  
-       mapping = f.file->f_mapping;
-       if (!mapping || len < 0) {
-               ret = -EINVAL;
-               goto out;
-       }
+       mapping = file->f_mapping;
+       if (!mapping || len < 0)
+               return -EINVAL;
  
         bdi = inode_to_bdi(mapping->host);
  
@@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                         /* no bad return value, but ignore advice */
                         break;
                 default:
-                       ret = -EINVAL;
+                       return -EINVAL;
                 }
-               goto out;
+               return 0;
         }
  
         /*
@@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
  
         switch (advice) {
         case POSIX_FADV_NORMAL:
-               f.file->f_ra.ra_pages = bdi->ra_pages;
-               spin_lock(&f.file->f_lock);
-               f.file->f_mode &= ~FMODE_RANDOM;
-               spin_unlock(&f.file->f_lock);
+               file->f_ra.ra_pages = bdi->ra_pages;
+               spin_lock(&file->f_lock);
+               file->f_mode &= ~FMODE_RANDOM;
+               spin_unlock(&file->f_lock);
                 break;
         case POSIX_FADV_RANDOM:
-               spin_lock(&f.file->f_lock);
-               f.file->f_mode |= FMODE_RANDOM;
-               spin_unlock(&f.file->f_lock);
+               spin_lock(&file->f_lock);
+               file->f_mode |= FMODE_RANDOM;
+               spin_unlock(&file->f_lock);
                 break;
         case POSIX_FADV_SEQUENTIAL:
-               f.file->f_ra.ra_pages = bdi->ra_pages * 2;
-               spin_lock(&f.file->f_lock);
-               f.file->f_mode &= ~FMODE_RANDOM;
-               spin_unlock(&f.file->f_lock);
+               file->f_ra.ra_pages = bdi->ra_pages * 2;
+               spin_lock(&file->f_lock);
+               file->f_mode &= ~FMODE_RANDOM;
+               spin_unlock(&file->f_lock);
                 break;
         case POSIX_FADV_WILLNEED:
                 /* First and last PARTIAL page! */
@@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                  * Ignore return value because fadvise() shall return
                  * success even if filesystem can't retrieve a hint,
                  */
-               force_page_cache_readahead(mapping, f.file, start_index,
-                                          nrpages);
+               force_page_cache_readahead(mapping, file, start_index, nrpages);
                 break;
         case POSIX_FADV_NOREUSE:
                 break;
@@ -183,9 +174,32 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                 }
                 break;
         default:
-               ret = -EINVAL;
+               return -EINVAL;
         }
-out:
+       return 0;
+}
+
+int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+       if (file->f_op->fadvise)
+               return file->f_op->fadvise(file, offset, len, advice);
+
+       return generic_fadvise(file, offset, len, advice);
+}
+EXPORT_SYMBOL(vfs_fadvise);
+
+#ifdef CONFIG_ADVISE_SYSCALLS
+
+int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+       struct fd f = fdget(fd);
+       int ret;
+
+       if (!f.file)
+               return -EBADF;
+
+       ret = vfs_fadvise(f.file, offset, len, advice);
+
         fdput(f);
         return ret;
  }
@@ -203,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
  }
  
  #endif
+#endif
diff --git a/mm/readahead.c b/mm/readahead.c

index a59ea70..4e63014 100644 (file)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -20,6 +20,7 @@
  #include <linux/file.h>
  #include <linux/mm_inline.h>
  #include <linux/blk-cgroup.h>
+#include <linux/fadvise.h>
  
  #include "internal.h"
  
@@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping,
  }
  EXPORT_SYMBOL_GPL(page_cache_async_readahead);
  
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
-            pgoff_t index, unsigned long nr)
-{
-       if (!mapping || !mapping->a_ops)
-               return -EINVAL;
-
-       /*
-        * Readahead doesn't make sense for DAX inodes, but we don't want it
-        * to report a failure either.  Instead, we just return success and
-        * don't do any work.
-        */
-       if (dax_mapping(mapping))
-               return 0;
-
-       return force_page_cache_readahead(mapping, filp, index, nr);
-}
-
  ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
  {
         ssize_t ret;
@@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
  
         ret = -EBADF;
         f = fdget(fd);
-       if (f.file) {
-               if (f.file->f_mode & FMODE_READ) {
-                       struct address_space *mapping = f.file->f_mapping;
-                       pgoff_t start = offset >> PAGE_SHIFT;
-                       pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
-                       unsigned long len = end - start + 1;
-                       ret = do_readahead(mapping, f.file, start, len);
-               }
-               fdput(f);
-       }
+       if (!f.file || !(f.file->f_mode & FMODE_READ))
+               goto out;
+
+       /*
+        * The readahead() syscall is intended to run only on files
+        * that can execute readahead. If readahead is not possible
+        * on this file, then we must return -EINVAL.
+        */
+       ret = -EINVAL;
+       if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
+           !S_ISREG(file_inode(f.file)->i_mode))
+               goto out;
+
+       ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+out:
+       fdput(f);
         return ret;
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Sep 2018 05:21:40 +0000 (19:21 -1000)
Documentation/filesystems/vfs.txt		patch | blob | history
fs/overlayfs/file.c		patch | blob | history
fs/overlayfs/inode.c		patch | blob | history
fs/overlayfs/super.c		patch | blob | history
include/linux/fs.h		patch | blob | history
mm/Makefile		patch | blob | history
mm/fadvise.c		patch | blob | history
mm/readahead.c		patch | blob | history