Merge branch 'akpm' (patches from Andrew)

author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index b920423..5025ff9 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -158,10 +158,6 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
  value lower than this limit will be ignored and the old configuration will be
  retained.
  
-Note: the value of dirty_bytes also must be set greater than
-dirty_background_bytes or the amount of memory corresponding to
-dirty_background_ratio.
-
  ==============================================================
  
  dirty_expire_centisecs
@@ -181,9 +177,6 @@ generating disk writes will itself start writing out dirty data.
  
  The total available memory is not equal to total system memory.
  
-Note: dirty_ratio must be set greater than dirty_background_ratio or
-ratio corresponding to dirty_background_bytes.
-
  ==============================================================
  
  dirty_writeback_centisecs
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h

index 2a029bc..1a7a17b 100644 (file)
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -221,7 +221,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
  }
  #define        __HAVE_ARCH_PTE_SPECIAL
  
-#define __HAVE_ARCH_PMD_WRITE
  #define pmd_write(pmd)         (pmd_isclear((pmd), L_PMD_SECT_RDONLY))
  #define pmd_dirty(pmd)         (pmd_isset((pmd), L_PMD_SECT_DIRTY))
  #define pud_page(pud)          pmd_page(__pmd(pud_val(pud)))
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h

index c9530b5..149d05f 100644 (file)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -345,7 +345,6 @@ static inline int pmd_protnone(pmd_t pmd)
  
  #define pmd_thp_or_huge(pmd)   (pmd_huge(pmd) || pmd_trans_huge(pmd))
  
-#define __HAVE_ARCH_PMD_WRITE
  #define pmd_write(pmd)         pte_write(pmd_pte(pmd))
  
  #define pmd_mkhuge(pmd)                (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h

index 9e9e944..1a508a7 100644 (file)
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -552,7 +552,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
  extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                        pmd_t *pmdp, pmd_t pmd);
  
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
  static inline int pmd_write(pmd_t pmd)
  {
         return !!(pmd_val(pmd) & _PAGE_WRITE);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h

index 9a677cd..4469781 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1005,7 +1005,6 @@ static inline int pmd_protnone(pmd_t pmd)
  }
  #endif /* CONFIG_NUMA_BALANCING */
  
-#define __HAVE_ARCH_PMD_WRITE
  #define pmd_write(pmd)         pte_write(pmd_pte(pmd))
  #define __pmd_write(pmd)       __pte_write(pmd_pte(pmd))
  #define pmd_savedwrite(pmd)    pte_savedwrite(pmd_pte(pmd))
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h

index d7fe983..57d7bc9 100644 (file)
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -709,7 +709,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
         return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
  }
  
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
  static inline int pmd_write(pmd_t pmd)
  {
         return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0;
@@ -1264,6 +1264,12 @@ static inline pud_t pud_mkwrite(pud_t pud)
         return pud;
  }
  
+#define pud_write pud_write    /* suppresses the generic pud_write() stub */
+static inline int pud_write(pud_t pud)
+{
+       return !!(pud_val(pud) & _REGION3_ENTRY_WRITE); /* 1 if bit set */
+}
+
  static inline pud_t pud_mkclean(pud_t pud)
  {
         if (pud_large(pud)) {
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h

index 5a9e96b..9937c5f 100644 (file)
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -715,7 +715,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
         return pte_pfn(pte);
  }
  
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
  static inline unsigned long pmd_write(pmd_t pmd)
  {
         pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c

index 5335ba3..33c0f8b 100644 (file)
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
         if (!(pmd_val(pmd) & _PAGE_VALID))
                 return 0;
  
-       if (write && !pmd_write(pmd))
+       if (!pmd_access_permitted(pmd, write))
                 return 0;
  
         refs = 0;
@@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
         if (!(pud_val(pud) & _PAGE_VALID))
                 return 0;
  
-       if (write && !pud_write(pud))
+       if (!pud_access_permitted(pud, write))
                 return 0;
  
         refs = 0;
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h

index 2a26cc4..adfa21b 100644 (file)
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -475,7 +475,6 @@ static inline void pmd_clear(pmd_t *pmdp)
  #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
  #define pmd_huge_page(pmd)     pte_huge(pmd_pte(pmd))
  #define pmd_mkhuge(pmd)                pte_pmd(pte_mkhuge(pmd_pte(pmd)))
-#define __HAVE_ARCH_PMD_WRITE
  
  #define pfn_pmd(pfn, pgprot)   pte_pmd(pfn_pte((pfn), (pgprot)))
  #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 09f9e1e..95e2dfd 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1061,7 +1061,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                   unsigned long address, pmd_t *pmdp);
  
  
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
  static inline int pmd_write(pmd_t pmd)
  {
         return pmd_flags(pmd) & _PAGE_RW;
@@ -1088,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
         clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
  }
  
+#define pud_write pud_write    /* suppresses the generic pud_write() stub */
+static inline int pud_write(pud_t pud)
+{
+       return pud_flags(pud) & _PAGE_RW;       /* nonzero if _PAGE_RW is set */
+}
+
  /*
   * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
   *
diff --git a/drivers/dax/device.c b/drivers/dax/device.c

index 6833ada..7b0bf82 100644 (file)
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -428,9 +428,21 @@ static int dev_dax_fault(struct vm_fault *vmf)
         return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
  }
  
+/* ->split() hook: only allow a split at a dax-region-aligned address. */
+static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
+{
+       struct dev_dax *dev_dax = vma->vm_file->private_data;
+
+       /* A nonzero return is handed back by __split_vma() as its error. */
+       if (IS_ALIGNED(addr, dev_dax->region->align))
+               return 0;
+       return -EINVAL;
+}
+
  static const struct vm_operations_struct dax_vm_ops = {
         .fault = dev_dax_fault,
         .huge_fault = dev_dax_huge_fault,
+       .split = dev_dax_split,
  };
  
  static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c

index 21e60b1..130606c 100644 (file)
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -191,7 +191,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
         sg_list_start = umem->sg_head.sgl;
  
         while (npages) {
-               ret = get_user_pages(cur_base,
+               ret = get_user_pages_longterm(cur_base,
                                      min_t(unsigned long, npages,
                                            PAGE_SIZE / sizeof (struct page *)),
                                      gup_flags, page_list, vma_list);
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c

index 0b5c43f..f412429 100644 (file)
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -185,12 +185,13 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
         dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
                 data, size, dma->nr_pages);
  
-       err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+       err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
                              flags, dma->pages, NULL);
  
         if (err != dma->nr_pages) {
                 dma->nr_pages = (err >= 0) ? err : 0;
-               dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages);
+               dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+                       dma->nr_pages);
                 return err < 0 ? err : -EINVAL;
         }
         return 0;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c

index d79ced9..82e8f6e 100644 (file)
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
                 pr_debug("waiting for mount name=%pd\n", path->dentry);
                 status = autofs4_wait(sbi, path, NFY_MOUNT);
                 pr_debug("mount wait done status=%d\n", status);
-               ino->last_used = jiffies;
         }
+       ino->last_used = jiffies;
         return status;
  }
  
@@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
          */
         if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
                 struct dentry *parent = dentry->d_parent;
+               struct autofs_info *ino;
                 struct dentry *new;
  
                 new = d_lookup(parent, &dentry->d_name);
                 if (!new)
                         return NULL;
-               if (new == dentry)
-                       dput(new);
-               else {
-                       struct autofs_info *ino;
-
-                       ino = autofs4_dentry_ino(new);
-                       ino->last_used = jiffies;
-                       dput(path->dentry);
-                       path->dentry = new;
-               }
+               ino = autofs4_dentry_ino(new);
+               ino->last_used = jiffies;
+               dput(path->dentry);
+               path->dentry = new;
         }
         return path->dentry;
  }
diff --git a/fs/dax.c b/fs/dax.c

index 9598159..78b72c4 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
  
                         if (pfn != pmd_pfn(*pmdp))
                                 goto unlock_pmd;
-                       if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+                       if (!pmd_dirty(*pmdp)
+                                       && !pmd_access_permitted(*pmdp, WRITE))
                                 goto unlock_pmd;
  
                         flush_cache_page(vma, address, pfn);
diff --git a/fs/exec.c b/fs/exec.c

index 1d6243d..6be2aa0 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm)
                  * avoid bad behavior from the prior rlimits. This has to
                  * happen before arch_pick_mmap_layout(), which examines
                  * RLIMIT_STACK, but after the point of no return to avoid
-                * needing to clean up the change on failure.
+                * races from other threads changing the limits. This also
+                * must be protected from races with prlimit() calls.
                  */
+               task_lock(current->group_leader);
                 if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM)
                         current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM;
+               if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM)
+                       current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM;
+               task_unlock(current->group_leader);
         }
  
         arch_pick_mmap_layout(current->mm);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c

index 016c46b..20a0a89 100644 (file)
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecache(void)
  
  static int fat_remount(struct super_block *sb, int *flags, char *data)
  {
-       int new_rdonly;
+       bool new_rdonly;
         struct msdos_sb_info *sbi = MSDOS_SB(sb);
         *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
  
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index 1e76730..8a85f3f 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  
                 /*
-                * page_put due to reference from alloc_huge_page()
                  * unlock_page because locked by add_to_page_cache()
+                * page_put due to reference from alloc_huge_page()
                  */
-               put_page(page);
                 unlock_page(page);
+               put_page(page);
         }
  
         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
diff --git a/fs/mbcache.c b/fs/mbcache.c

index d818fd2..b8b8b9c 100644 (file)
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
         struct mb_cache *cache = container_of(shrink, struct mb_cache,
                                               c_shrink);
  
+       /* Unlikely, but not impossible */
+       if (unlikely(cache->c_entry_count < 0))
+               return 0;
         return cache->c_entry_count;
  }
  
diff --git a/fs/namei.c b/fs/namei.c

index f0c7a7b..9cc91fb 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd,
          * of the daemon to instantiate them before they can be used.
          */
         if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-                          LOOKUP_OPEN | LOOKUP_CREATE |
-                          LOOKUP_AUTOMOUNT))) {
-               /* Positive dentry that isn't meant to trigger an
-                * automount, EISDIR will allow it to be used,
-                * otherwise there's no mount here "now" so return
-                * ENOENT.
-                */
-               if (path->dentry->d_inode)
-                       return -EISDIR;
-               else
-                       return -ENOENT;
-       }
+                          LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+           path->dentry->d_inode)
+               return -EISDIR;
  
         if (path->dentry->d_sb->s_user_ns != &init_user_ns)
                 return -EACCES;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h

index 757dc6f..b234d54 100644 (file)
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -805,15 +805,23 @@ static inline int pmd_trans_huge(pmd_t pmd)
  {
         return 0;
  }
-#ifndef __HAVE_ARCH_PMD_WRITE
+#ifndef pmd_write
  static inline int pmd_write(pmd_t pmd)
  {
         BUG();
         return 0;
  }
-#endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* pmd_write */
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
+#ifndef pud_write      /* an arch opts out by defining pud_write itself */
+static inline int pud_write(pud_t pud)
+{
+       BUG();          /* the fallback is never expected to be called */
+       return 0;
+}
+#endif /* pud_write */
+
  #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
         (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
          !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
diff --git a/include/linux/fs.h b/include/linux/fs.h

index bbd92da..511fbaa 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3088,7 +3088,8 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat)
  static inline int vfs_fstatat(int dfd, const char __user *filename,
                               struct kstat *stat, int flags)
  {
-       return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
+       return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
+                        stat, STATX_BASIC_STATS);
  }
  static inline int vfs_fstat(int fd, struct kstat *stat)
  {
@@ -3194,6 +3195,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
         return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
  }
  
+/* True for filesystem-dax mappings; device-dax (char device) is excluded. */
+static inline bool vma_is_fsdax(struct vm_area_struct *vma)
+{
+       struct inode *inode;
+
+       if (!vma->vm_file || !vma_is_dax(vma))
+               return false;
+       inode = file_inode(vma->vm_file);
+       /* i_mode also holds permission bits: test the file type field only. */
+       if (S_ISCHR(inode->i_mode))
+               return false; /* device-dax */
+       return true;
+}
+
  static inline int iocb_flags(struct file *file)
  {
         int res = 0;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index fbf5b31..82a2588 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd)
  }
  #endif
  
-#ifndef pud_write
-static inline int pud_write(pud_t pud)
-{
-       BUG();
-       return 0;
-}
-#endif
-
  #define HUGETLB_ANON_FILE "anon_hugepage"
  
  enum {
diff --git a/include/linux/migrate.h b/include/linux/migrate.h

index 895ec0c..a2246cf 100644 (file)
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page,
         new_page = __alloc_pages_nodemask(gfp_mask, order,
                                 preferred_nid, nodemask);
  
-       if (new_page && PageTransHuge(page))
+       if (new_page && PageTransHuge(new_page))
                 prep_transhuge_page(new_page);
  
         return new_page;
diff --git a/include/linux/mm.h b/include/linux/mm.h

index ee07314..ea818ff 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -377,6 +377,7 @@ enum page_entry_size {
  struct vm_operations_struct {
         void (*open)(struct vm_area_struct * area);
         void (*close)(struct vm_area_struct * area);
+       int (*split)(struct vm_area_struct * area, unsigned long addr);
         int (*mremap)(struct vm_area_struct * area);
         int (*fault)(struct vm_fault *vmf);
         int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
@@ -1379,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                     unsigned int gup_flags, struct page **pages, int *locked);
  long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                     struct page **pages, unsigned int gup_flags);
+#ifdef CONFIG_FS_DAX   /* out-of-line variant that rejects fs-dax vmas */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+                           unsigned int gup_flags, struct page **pages,
+                           struct vm_area_struct **vmas);
+#else  /* !CONFIG_FS_DAX: nothing to reject, forward to get_user_pages() */
+static inline long get_user_pages_longterm(unsigned long start,
+               unsigned long nr_pages, unsigned int gup_flags,
+               struct page **pages, struct vm_area_struct **vmas)
+{
+       return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+#endif /* CONFIG_FS_DAX */
+
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages);
  
diff --git a/mm/frame_vector.c b/mm/frame_vector.c

index 2f98df0..297c723 100644 (file)
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
                 ret = -EFAULT;
                 goto out;
         }
+
+       /*
+        * get_vaddr_frames() could do transient pinning, but all current
+        * users pin pages long term (userspace controlled lifetime), so
+        * like get_user_pages_longterm() reject filesystem-dax mappings.
+        * Leave via "out", as the failed vma lookup above does.
+        */
+       if (vma_is_fsdax(vma)) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
         if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
                 vec->got_ref = true;
                 vec->is_pfns = false;
diff --git a/mm/gup.c b/mm/gup.c

index dfcde13..d3fb60e 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
   */
  static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
  {
-       return pte_write(pte) ||
+       return pte_access_permitted(pte, WRITE) ||
                 ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
  }
  
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
  }
  EXPORT_SYMBOL(get_user_pages);
  
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+               unsigned int gup_flags, struct page **pages,
+               struct vm_area_struct **vmas_arg)
+{
+       struct vm_area_struct **vmas = vmas_arg;
+       struct vm_area_struct *last_vma = NULL;
+       bool found_fsdax = false;
+       long nr_pinned, idx;
+
+       if (!pages)
+               return -EINVAL;
+
+       /* The caller did not supply a vma array: use a temporary one. */
+       if (!vmas) {
+               vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+                              GFP_KERNEL);
+               if (!vmas)
+                       return -ENOMEM;
+       }
+
+       nr_pinned = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+       /*
+        * A run of entries pointing at the same vma is checked only once.
+        * A negative nr_pinned (error) skips the loop entirely.
+        */
+       for (idx = 0; idx < nr_pinned && !found_fsdax; idx++) {
+               if (vmas[idx] == last_vma)
+                       continue;
+               last_vma = vmas[idx];
+               found_fsdax = vma_is_fsdax(last_vma);
+       }
+
+       /*
+        * A filesystem-dax vma was found: drop every page reference
+        * taken above and fail the whole request.
+        */
+       if (found_fsdax) {
+               for (idx = 0; idx < nr_pinned; idx++)
+                       put_page(pages[idx]);
+               nr_pinned = -EOPNOTSUPP;
+       }
+
+       /* Free the vma array only if it was allocated here. */
+       if (vmas != vmas_arg)
+               kfree(vmas);
+       return nr_pinned;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
  /**
   * populate_vma_page_range() -  populate a range of pages in the vma.
   * @vma:   target vma
diff --git a/mm/hmm.c b/mm/hmm.c

index ea19742..3a5c172 100644 (file)
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -391,11 +391,11 @@ again:
                 if (pmd_protnone(pmd))
                         return hmm_vma_walk_clear(start, end, walk);
  
-               if (write_fault && !pmd_write(pmd))
+               if (!pmd_access_permitted(pmd, write_fault))
                         return hmm_vma_walk_clear(start, end, walk);
  
                 pfn = pmd_pfn(pmd) + pte_index(addr);
-               flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+               flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0;
                 for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                         pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
                 return 0;
@@ -456,11 +456,11 @@ again:
                         continue;
                 }
  
-               if (write_fault && !pte_write(pte))
+               if (!pte_access_permitted(pte, write_fault))
                         goto fault;
  
                 pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
-               pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+               pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0;
                 continue;
  
  fault:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 0e7ded9..2f2f5e7 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
          */
         WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
  
-       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+       if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE))
                 return NULL;
  
         if (pmd_present(*pmd) && pmd_devmap(*pmd))
@@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
  
         assert_spin_locked(pud_lockptr(mm, pud));
  
-       if (flags & FOLL_WRITE && !pud_write(*pud))
+       if (!pud_access_permitted(*pud, flags & FOLL_WRITE))
                 return NULL;
  
         if (pud_present(*pud) && pud_devmap(*pud))
@@ -1386,7 +1386,7 @@ out_unlock:
   */
  static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
  {
-       return pmd_write(pmd) ||
+       return pmd_access_permitted(pmd, WRITE) ||
                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
  }
  
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 681b300..9a334f5 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         }
  }
  
+/* vm_ops->split hook, called by __split_vma() before it splits the vma. */
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+       /* A split is only allowed on a huge page boundary of this vma. */
+       return (addr & ~huge_page_mask(hstate_vma(vma))) ? -EINVAL : 0;
+}
+
  /*
   * We cannot handle pagefaults against hugetlb pages at all.  They cause
   * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
         .fault = hugetlb_vm_op_fault,
         .open = hugetlb_vm_op_open,
         .close = hugetlb_vm_op_close,
+       .split = hugetlb_vm_op_split,
  };
  
  static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -4627,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
         pte_t *pte = NULL;
  
         pgd = pgd_offset(mm, addr);
-       p4d = p4d_offset(pgd, addr);
+       p4d = p4d_alloc(mm, pgd, addr);
+       if (!p4d)
+               return NULL;
         pud = pud_alloc(mm, p4d, addr);
         if (pud) {
                 if (sz == PUD_SIZE) {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c

index e4738d5..3d47817 100644 (file)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1523,6 +1523,8 @@ static void kmemleak_scan(void)
                         if (page_count(page) == 0)
                                 continue;
                         scan_block(page, page + 1, NULL);
+                       if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page))))
+                               cond_resched();
                 }
         }
         put_online_mems();
diff --git a/mm/madvise.c b/mm/madvise.c

index 375cf32..751e97a 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma,
  {
         struct file *file = vma->vm_file;
  
+       *prev = vma;
  #ifdef CONFIG_SWAP
         if (!file) {
-               *prev = vma;
                 force_swapin_readahead(vma, start, end);
                 return 0;
         }
  
         if (shmem_mapping(file->f_mapping)) {
-               *prev = vma;
                 force_shm_swapin_readahead(vma, start, end,
                                         file->f_mapping);
                 return 0;
@@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma,
                 return 0;
         }
  
-       *prev = vma;
         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
         if (end > vma->vm_end)
                 end = vma->vm_end;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 50e6906..ac2ffd5 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         memcg_check_events(memcg, page);
  
         if (!mem_cgroup_is_root(memcg))
-               css_put(&memcg->css);
+               css_put_many(&memcg->css, nr_entries);
  }
  
  /**
diff --git a/mm/memory.c b/mm/memory.c

index 85e7a87..5eb3d25 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3948,7 +3948,7 @@ static int handle_pte_fault(struct vm_fault *vmf)
         if (unlikely(!pte_same(*vmf->pte, entry)))
                 goto unlock;
         if (vmf->flags & FAULT_FLAG_WRITE) {
-               if (!pte_write(entry))
+               if (!pte_access_permitted(entry, WRITE))
                         return do_wp_page(vmf);
                 entry = pte_mkdirty(entry);
         }
@@ -4013,7 +4013,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
  
                         /* NUMA case for anonymous PUDs would go here */
  
-                       if (dirty && !pud_write(orig_pud)) {
+                       if (dirty && !pud_access_permitted(orig_pud, WRITE)) {
                                 ret = wp_huge_pud(&vmf, orig_pud);
                                 if (!(ret & VM_FAULT_FALLBACK))
                                         return ret;
@@ -4046,7 +4046,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                 return do_huge_pmd_numa_page(&vmf, orig_pmd);
  
-                       if (dirty && !pmd_write(orig_pmd)) {
+                       if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) {
                                 ret = wp_huge_pmd(&vmf, orig_pmd);
                                 if (!(ret & VM_FAULT_FALLBACK))
                                         return ret;
@@ -4336,7 +4336,7 @@ int follow_phys(struct vm_area_struct *vma,
                 goto out;
         pte = *ptep;
  
-       if ((flags & FOLL_WRITE) && !pte_write(pte))
+       if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                 goto unlock;
  
         *prot = pgprot_val(pte_pgprot(pte));
diff --git a/mm/mmap.c b/mm/mmap.c

index 924839f..a4d5468 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
         struct vm_area_struct *new;
         int err;
  
-       if (is_vm_hugetlb_page(vma) && (addr &
-                                       ~(huge_page_mask(hstate_vma(vma)))))
-               return -EINVAL;
+       if (vma->vm_ops && vma->vm_ops->split) {
+               err = vma->vm_ops->split(vma, addr);
+               if (err)
+                       return err;
+       }
  
         new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
         if (!new)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index c86fbd1..c957be3 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -550,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
          */
         set_bit(MMF_UNSTABLE, &mm->flags);
  
-       tlb_gather_mmu(&tlb, mm, 0, -1);
         for (vma = mm->mmap ; vma; vma = vma->vm_next) {
                 if (!can_madv_dontneed_vma(vma))
                         continue;
@@ -565,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
                  * we do not want to block exit_mmap by keeping mm ref
                  * count elevated without a good reason.
                  */
-               if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+               if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+                       tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
                         unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
                                          NULL);
+                       tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+               }
         }
-       tlb_finish_mmu(&tlb, 0, -1);
         pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                         task_pid_nr(tsk), tsk->comm,
                         K(get_mm_counter(mm, MM_ANONPAGES)),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index e709503..586f312 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -433,11 +433,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
         else
                 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
  
-       if (unlikely(bg_thresh >= thresh)) {
-               pr_warn("vm direct limit must be set greater than background limit.\n");
+       if (bg_thresh >= thresh)
                 bg_thresh = thresh / 2;
-       }
-
         tsk = current;
         if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index d4096f4..73f5d45 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2507,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
         if (WARN_ON_ONCE(!mm_percpu_wq))
                 return;
  
-       /* Workqueues cannot recurse */
-       if (current->flags & PF_WQ_WORKER)
-               return;
-
         /*
          * Do not drain if one is already in progress unless it's specific to
          * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -7656,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
  
         /*
          * In case of -EBUSY, we'd like to know which page causes problem.
-        * So, just fall through. We will check it in test_pages_isolated().
+        * So, just fall through. test_pages_isolated() has a tracepoint
+        * which will report the busy page.
+        *
+        * It is possible that busy pages could become available before
+        * the call to test_pages_isolated, and the range will actually be
+        * allocated.  So, if we fall through be sure to clear ret so that
+        * -EBUSY is not accidentally used or returned to caller.
          */
         ret = __alloc_contig_migrate_range(&cc, start, end);
         if (ret && ret != -EBUSY)
                 goto done;
+       ret =0;
  
         /*
          * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter

index 6f099f9..94b6648 100755 (executable)
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -83,8 +83,11 @@ def print_result(symboltype, symbolformat, argc):
      for d, n in delta:
          if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
  
-    print("Total: Before=%d, After=%d, chg %+.2f%%" % \
-        (otot, ntot, (ntot - otot)*100.0/otot))
+    if otot:
+        percent = (ntot - otot) * 100.0 / otot
+    else:
+        percent = 0
+    print("Total: Before=%d, After=%d, chg %+.2f%%" % (otot, ntot, percent))
  
  if sys.argv[1] == "-c":
      print_result("Function", "tT", 3)
diff --git a/scripts/faddr2line b/scripts/faddr2line

index 1f5ce95..39e07d8 100755 (executable)
--- a/scripts/faddr2line
+++ b/scripts/faddr2line
@@ -44,9 +44,16 @@
  set -o errexit
  set -o nounset
  
+READELF="${CROSS_COMPILE}readelf"
+ADDR2LINE="${CROSS_COMPILE}addr2line"
+SIZE="${CROSS_COMPILE}size"
+NM="${CROSS_COMPILE}nm"
+
  command -v awk >/dev/null 2>&1 || die "awk isn't installed"
-command -v readelf >/dev/null 2>&1 || die "readelf isn't installed"
-command -v addr2line >/dev/null 2>&1 || die "addr2line isn't installed"
+command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed"
+command -v ${ADDR2LINE} >/dev/null 2>&1 || die "addr2line isn't installed"
+command -v ${SIZE} >/dev/null 2>&1 || die "size isn't installed"
+command -v ${NM} >/dev/null 2>&1 || die "nm isn't installed"
  
  usage() {
         echo "usage: faddr2line <object file> <func+offset> <func+offset>..." >&2
@@ -69,10 +76,10 @@ die() {
  find_dir_prefix() {
         local objfile=$1
  
-       local start_kernel_addr=$(readelf -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}')
+       local start_kernel_addr=$(${READELF} -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}')
         [[ -z $start_kernel_addr ]] && return
  
-       local file_line=$(addr2line -e $objfile $start_kernel_addr)
+       local file_line=$(${ADDR2LINE} -e $objfile $start_kernel_addr)
         [[ -z $file_line ]] && return
  
         local prefix=${file_line%init/main.c:*}
@@ -104,7 +111,7 @@ __faddr2line() {
  
         # Go through each of the object's symbols which match the func name.
         # In rare cases there might be duplicates.
-       file_end=$(size -Ax $objfile | awk '$1 == ".text" {print $2}')
+       file_end=$(${SIZE} -Ax $objfile | awk '$1 == ".text" {print $2}')
         while read symbol; do
                 local fields=($symbol)
                 local sym_base=0x${fields[0]}
@@ -156,10 +163,10 @@ __faddr2line() {
  
                 # pass real address to addr2line
                 echo "$func+$offset/$sym_size:"
-               addr2line -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;"
+               ${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;"
                 DONE=1
  
-       done < <(nm -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }')
+       done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }')
  }
  
  [[ $# -lt 2 ]] && usage
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
Documentation/sysctl/vm.txt		patch \| blob \| history
arch/arm/include/asm/pgtable-3level.h		patch \| blob \| history
arch/arm64/include/asm/pgtable.h		patch \| blob \| history
arch/mips/include/asm/pgtable.h		patch \| blob \| history
arch/powerpc/include/asm/book3s/64/pgtable.h		patch \| blob \| history
arch/s390/include/asm/pgtable.h		patch \| blob \| history
arch/sparc/include/asm/pgtable_64.h		patch \| blob \| history
arch/sparc/mm/gup.c		patch \| blob \| history
arch/tile/include/asm/pgtable.h		patch \| blob \| history
arch/x86/include/asm/pgtable.h		patch \| blob \| history
drivers/dax/device.c		patch \| blob \| history
drivers/infiniband/core/umem.c		patch \| blob \| history
drivers/media/v4l2-core/videobuf-dma-sg.c		patch \| blob \| history
fs/autofs4/root.c		patch \| blob \| history
fs/dax.c		patch \| blob \| history
fs/exec.c		patch \| blob \| history
fs/fat/inode.c		patch \| blob \| history
fs/hugetlbfs/inode.c		patch \| blob \| history
fs/mbcache.c		patch \| blob \| history
fs/namei.c		patch \| blob \| history
include/asm-generic/pgtable.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/hugetlb.h		patch \| blob \| history
include/linux/migrate.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
mm/frame_vector.c		patch \| blob \| history
mm/gup.c		patch \| blob \| history
mm/hmm.c		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/kmemleak.c		patch \| blob \| history
mm/madvise.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/oom_kill.c		patch \| blob \| history
mm/page-writeback.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
scripts/bloat-o-meter		patch \| blob \| history
scripts/faddr2line		patch \| blob \| history