Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Nov 2017 03:12:44 +0000 (19:12 -0800)
Mergr misc fixes from Andrew Morton:
 "28 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits)
  fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate()
  mm/hugetlb: fix NULL-pointer dereference on 5-level paging machine
  autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored"
  autofs: revert "autofs: take more care to not update last_used on path walk"
  fs/fat/inode.c: fix sb_rdonly() change
  mm, memcg: fix mem_cgroup_swapout() for THPs
  mm: migrate: fix an incorrect call of prep_transhuge_page()
  kmemleak: add scheduling point to kmemleak_scan()
  scripts/bloat-o-meter: don't fail with division by 0
  fs/mbcache.c: make count_objects() more robust
  Revert "mm/page-writeback.c: print a warning if the vm dirtiness settings are illogical"
  mm/madvise.c: fix madvise() infinite loop under special circumstances
  exec: avoid RLIMIT_STACK races with prlimit()
  IB/core: disable memory registration of filesystem-dax vmas
  v4l2: disable filesystem-dax mapping support
  mm: fail get_vaddr_frames() for filesystem-dax mappings
  mm: introduce get_user_pages_longterm
  device-dax: implement ->split() to catch invalid munmap attempts
  mm, hugetlbfs: introduce ->split() to vm_operations_struct
  scripts/faddr2line: extend usage on generic arch
  ...

40 files changed:
Documentation/sysctl/vm.txt
arch/arm/include/asm/pgtable-3level.h
arch/arm64/include/asm/pgtable.h
arch/mips/include/asm/pgtable.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/s390/include/asm/pgtable.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/mm/gup.c
arch/tile/include/asm/pgtable.h
arch/x86/include/asm/pgtable.h
drivers/dax/device.c
drivers/infiniband/core/umem.c
drivers/media/v4l2-core/videobuf-dma-sg.c
fs/autofs4/root.c
fs/dax.c
fs/exec.c
fs/fat/inode.c
fs/hugetlbfs/inode.c
fs/mbcache.c
fs/namei.c
include/asm-generic/pgtable.h
include/linux/fs.h
include/linux/hugetlb.h
include/linux/migrate.h
include/linux/mm.h
mm/frame_vector.c
mm/gup.c
mm/hmm.c
mm/huge_memory.c
mm/hugetlb.c
mm/kmemleak.c
mm/madvise.c
mm/memcontrol.c
mm/memory.c
mm/mmap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
scripts/bloat-o-meter
scripts/faddr2line

index b920423..5025ff9 100644 (file)
@@ -158,10 +158,6 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
 value lower than this limit will be ignored and the old configuration will be
 retained.
 
-Note: the value of dirty_bytes also must be set greater than
-dirty_background_bytes or the amount of memory corresponding to
-dirty_background_ratio.
-
 ==============================================================
 
 dirty_expire_centisecs
@@ -181,9 +177,6 @@ generating disk writes will itself start writing out dirty data.
 
 The total available memory is not equal to total system memory.
 
-Note: dirty_ratio must be set greater than dirty_background_ratio or
-ratio corresponding to dirty_background_bytes.
-
 ==============================================================
 
 dirty_writeback_centisecs
index 2a029bc..1a7a17b 100644 (file)
@@ -221,7 +221,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 }
 #define        __HAVE_ARCH_PTE_SPECIAL
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)         (pmd_isclear((pmd), L_PMD_SECT_RDONLY))
 #define pmd_dirty(pmd)         (pmd_isset((pmd), L_PMD_SECT_DIRTY))
 #define pud_page(pud)          pmd_page(__pmd(pud_val(pud)))
index c9530b5..149d05f 100644 (file)
@@ -345,7 +345,6 @@ static inline int pmd_protnone(pmd_t pmd)
 
 #define pmd_thp_or_huge(pmd)   (pmd_huge(pmd) || pmd_trans_huge(pmd))
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)         pte_write(pmd_pte(pmd))
 
 #define pmd_mkhuge(pmd)                (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
index 9e9e944..1a508a7 100644 (file)
@@ -552,7 +552,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                       pmd_t *pmdp, pmd_t pmd);
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
        return !!(pmd_val(pmd) & _PAGE_WRITE);
index 9a677cd..4469781 100644 (file)
@@ -1005,7 +1005,6 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)         pte_write(pmd_pte(pmd))
 #define __pmd_write(pmd)       __pte_write(pmd_pte(pmd))
 #define pmd_savedwrite(pmd)    pte_savedwrite(pmd_pte(pmd))
index d7fe983..57d7bc9 100644 (file)
@@ -709,7 +709,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
        return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
 }
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
        return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0;
@@ -1264,6 +1264,12 @@ static inline pud_t pud_mkwrite(pud_t pud)
        return pud;
 }
 
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+       return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
+}
+
 static inline pud_t pud_mkclean(pud_t pud)
 {
        if (pud_large(pud)) {
index 5a9e96b..9937c5f 100644 (file)
@@ -715,7 +715,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
        return pte_pfn(pte);
 }
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline unsigned long pmd_write(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
index 5335ba3..33c0f8b 100644 (file)
@@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        if (!(pmd_val(pmd) & _PAGE_VALID))
                return 0;
 
-       if (write && !pmd_write(pmd))
+       if (!pmd_access_permitted(pmd, write))
                return 0;
 
        refs = 0;
@@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
        if (!(pud_val(pud) & _PAGE_VALID))
                return 0;
 
-       if (write && !pud_write(pud))
+       if (!pud_access_permitted(pud, write))
                return 0;
 
        refs = 0;
index 2a26cc4..adfa21b 100644 (file)
@@ -475,7 +475,6 @@ static inline void pmd_clear(pmd_t *pmdp)
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_huge_page(pmd)     pte_huge(pmd_pte(pmd))
 #define pmd_mkhuge(pmd)                pte_pmd(pte_mkhuge(pmd_pte(pmd)))
-#define __HAVE_ARCH_PMD_WRITE
 
 #define pfn_pmd(pfn, pgprot)   pte_pmd(pfn_pte((pfn), (pgprot)))
 #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
index 09f9e1e..95e2dfd 100644 (file)
@@ -1061,7 +1061,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
 
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
        return pmd_flags(pmd) & _PAGE_RW;
@@ -1088,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
        clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+       return pud_flags(pud) & _PAGE_RW;
+}
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
index 6833ada..7b0bf82 100644 (file)
@@ -428,9 +428,21 @@ static int dev_dax_fault(struct vm_fault *vmf)
        return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
 
+static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
+{
+       struct file *filp = vma->vm_file;
+       struct dev_dax *dev_dax = filp->private_data;
+       struct dax_region *dax_region = dev_dax->region;
+
+       if (!IS_ALIGNED(addr, dax_region->align))
+               return -EINVAL;
+       return 0;
+}
+
 static const struct vm_operations_struct dax_vm_ops = {
        .fault = dev_dax_fault,
        .huge_fault = dev_dax_huge_fault,
+       .split = dev_dax_split,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
index 21e60b1..130606c 100644 (file)
@@ -191,7 +191,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
        sg_list_start = umem->sg_head.sgl;
 
        while (npages) {
-               ret = get_user_pages(cur_base,
+               ret = get_user_pages_longterm(cur_base,
                                     min_t(unsigned long, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
                                     gup_flags, page_list, vma_list);
index 0b5c43f..f412429 100644 (file)
@@ -185,12 +185,13 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
        dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
                data, size, dma->nr_pages);
 
-       err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+       err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
                             flags, dma->pages, NULL);
 
        if (err != dma->nr_pages) {
                dma->nr_pages = (err >= 0) ? err : 0;
-               dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages);
+               dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+                       dma->nr_pages);
                return err < 0 ? err : -EINVAL;
        }
        return 0;
index d79ced9..82e8f6e 100644 (file)
@@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
                pr_debug("waiting for mount name=%pd\n", path->dentry);
                status = autofs4_wait(sbi, path, NFY_MOUNT);
                pr_debug("mount wait done status=%d\n", status);
-               ino->last_used = jiffies;
        }
+       ino->last_used = jiffies;
        return status;
 }
 
@@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
         */
        if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
                struct dentry *parent = dentry->d_parent;
+               struct autofs_info *ino;
                struct dentry *new;
 
                new = d_lookup(parent, &dentry->d_name);
                if (!new)
                        return NULL;
-               if (new == dentry)
-                       dput(new);
-               else {
-                       struct autofs_info *ino;
-
-                       ino = autofs4_dentry_ino(new);
-                       ino->last_used = jiffies;
-                       dput(path->dentry);
-                       path->dentry = new;
-               }
+               ino = autofs4_dentry_ino(new);
+               ino->last_used = jiffies;
+               dput(path->dentry);
+               path->dentry = new;
        }
        return path->dentry;
 }
index 9598159..78b72c4 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 
                        if (pfn != pmd_pfn(*pmdp))
                                goto unlock_pmd;
-                       if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+                       if (!pmd_dirty(*pmdp)
+                                       && !pmd_access_permitted(*pmdp, WRITE))
                                goto unlock_pmd;
 
                        flush_cache_page(vma, address, pfn);
index 1d6243d..6be2aa0 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm)
                 * avoid bad behavior from the prior rlimits. This has to
                 * happen before arch_pick_mmap_layout(), which examines
                 * RLIMIT_STACK, but after the point of no return to avoid
-                * needing to clean up the change on failure.
+                * races from other threads changing the limits. This also
+                * must be protected from races with prlimit() calls.
                 */
+               task_lock(current->group_leader);
                if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM)
                        current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM;
+               if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM)
+                       current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM;
+               task_unlock(current->group_leader);
        }
 
        arch_pick_mmap_layout(current->mm);
index 016c46b..20a0a89 100644 (file)
@@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecache(void)
 
 static int fat_remount(struct super_block *sb, int *flags, char *data)
 {
-       int new_rdonly;
+       bool new_rdonly;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
 
index 1e76730..8a85f3f 100644 (file)
@@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
                /*
-                * page_put due to reference from alloc_huge_page()
                 * unlock_page because locked by add_to_page_cache()
+                * page_put due to reference from alloc_huge_page()
                 */
-               put_page(page);
                unlock_page(page);
+               put_page(page);
        }
 
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
index d818fd2..b8b8b9c 100644 (file)
@@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
        struct mb_cache *cache = container_of(shrink, struct mb_cache,
                                              c_shrink);
 
+       /* Unlikely, but not impossible */
+       if (unlikely(cache->c_entry_count < 0))
+               return 0;
        return cache->c_entry_count;
 }
 
index f0c7a7b..9cc91fb 100644 (file)
@@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd,
         * of the daemon to instantiate them before they can be used.
         */
        if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-                          LOOKUP_OPEN | LOOKUP_CREATE |
-                          LOOKUP_AUTOMOUNT))) {
-               /* Positive dentry that isn't meant to trigger an
-                * automount, EISDIR will allow it to be used,
-                * otherwise there's no mount here "now" so return
-                * ENOENT.
-                */
-               if (path->dentry->d_inode)
-                       return -EISDIR;
-               else
-                       return -ENOENT;
-       }
+                          LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+           path->dentry->d_inode)
+               return -EISDIR;
 
        if (path->dentry->d_sb->s_user_ns != &init_user_ns)
                return -EACCES;
index 757dc6f..b234d54 100644 (file)
@@ -805,15 +805,23 @@ static inline int pmd_trans_huge(pmd_t pmd)
 {
        return 0;
 }
-#ifndef __HAVE_ARCH_PMD_WRITE
+#ifndef pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
        BUG();
        return 0;
 }
-#endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* pmd_write */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifndef pud_write
+static inline int pud_write(pud_t pud)
+{
+       BUG();
+       return 0;
+}
+#endif /* pud_write */
+
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
         !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
index bbd92da..511fbaa 100644 (file)
@@ -3088,7 +3088,8 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat)
 static inline int vfs_fstatat(int dfd, const char __user *filename,
                              struct kstat *stat, int flags)
 {
-       return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
+       return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
+                        stat, STATX_BASIC_STATS);
 }
 static inline int vfs_fstat(int fd, struct kstat *stat)
 {
@@ -3194,6 +3195,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
 
+static inline bool vma_is_fsdax(struct vm_area_struct *vma)
+{
+       struct inode *inode;
+
+       if (!vma->vm_file)
+               return false;
+       if (!vma_is_dax(vma))
+               return false;
+       inode = file_inode(vma->vm_file);
+       if (inode->i_mode == S_IFCHR)
+               return false; /* device-dax */
+       return true;
+}
+
 static inline int iocb_flags(struct file *file)
 {
        int res = 0;
index fbf5b31..82a2588 100644 (file)
@@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd)
 }
 #endif
 
-#ifndef pud_write
-static inline int pud_write(pud_t pud)
-{
-       BUG();
-       return 0;
-}
-#endif
-
 #define HUGETLB_ANON_FILE "anon_hugepage"
 
 enum {
index 895ec0c..a2246cf 100644 (file)
@@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page,
        new_page = __alloc_pages_nodemask(gfp_mask, order,
                                preferred_nid, nodemask);
 
-       if (new_page && PageTransHuge(page))
+       if (new_page && PageTransHuge(new_page))
                prep_transhuge_page(new_page);
 
        return new_page;
index ee07314..ea818ff 100644 (file)
@@ -377,6 +377,7 @@ enum page_entry_size {
 struct vm_operations_struct {
        void (*open)(struct vm_area_struct * area);
        void (*close)(struct vm_area_struct * area);
+       int (*split)(struct vm_area_struct * area, unsigned long addr);
        int (*mremap)(struct vm_area_struct * area);
        int (*fault)(struct vm_fault *vmf);
        int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
@@ -1379,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages, int *locked);
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
+#ifdef CONFIG_FS_DAX
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+                           unsigned int gup_flags, struct page **pages,
+                           struct vm_area_struct **vmas);
+#else
+static inline long get_user_pages_longterm(unsigned long start,
+               unsigned long nr_pages, unsigned int gup_flags,
+               struct page **pages, struct vm_area_struct **vmas)
+{
+       return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+#endif /* CONFIG_FS_DAX */
+
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
 
index 2f98df0..297c723 100644 (file)
@@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
                ret = -EFAULT;
                goto out;
        }
+
+       /*
+        * While get_vaddr_frames() could be used for transient (kernel
+        * controlled lifetime) pinning of memory pages all current
+        * users establish long term (userspace controlled lifetime)
+        * page pinning. Treat get_vaddr_frames() like
+        * get_user_pages_longterm() and disallow it for filesystem-dax
+        * mappings.
+        */
+       if (vma_is_fsdax(vma))
+               return -EOPNOTSUPP;
+
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
                vec->got_ref = true;
                vec->is_pfns = false;
index dfcde13..d3fb60e 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
  */
 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 {
-       return pte_write(pte) ||
+       return pte_access_permitted(pte, WRITE) ||
                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 }
 
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+               unsigned int gup_flags, struct page **pages,
+               struct vm_area_struct **vmas_arg)
+{
+       struct vm_area_struct **vmas = vmas_arg;
+       struct vm_area_struct *vma_prev = NULL;
+       long rc, i;
+
+       if (!pages)
+               return -EINVAL;
+
+       if (!vmas) {
+               vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+                              GFP_KERNEL);
+               if (!vmas)
+                       return -ENOMEM;
+       }
+
+       rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+       for (i = 0; i < rc; i++) {
+               struct vm_area_struct *vma = vmas[i];
+
+               if (vma == vma_prev)
+                       continue;
+
+               vma_prev = vma;
+
+               if (vma_is_fsdax(vma))
+                       break;
+       }
+
+       /*
+        * Either get_user_pages() failed, or the vma validation
+        * succeeded, in either case we don't need to put_page() before
+        * returning.
+        */
+       if (i >= rc)
+               goto out;
+
+       for (i = 0; i < rc; i++)
+               put_page(pages[i]);
+       rc = -EOPNOTSUPP;
+out:
+       if (vmas != vmas_arg)
+               kfree(vmas);
+       return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
  * @vma:   target vma
index ea19742..3a5c172 100644 (file)
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -391,11 +391,11 @@ again:
                if (pmd_protnone(pmd))
                        return hmm_vma_walk_clear(start, end, walk);
 
-               if (write_fault && !pmd_write(pmd))
+               if (!pmd_access_permitted(pmd, write_fault))
                        return hmm_vma_walk_clear(start, end, walk);
 
                pfn = pmd_pfn(pmd) + pte_index(addr);
-               flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+               flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0;
                for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                        pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
                return 0;
@@ -456,11 +456,11 @@ again:
                        continue;
                }
 
-               if (write_fault && !pte_write(pte))
+               if (!pte_access_permitted(pte, write_fault))
                        goto fault;
 
                pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
-               pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+               pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0;
                continue;
 
 fault:
index 0e7ded9..2f2f5e7 100644 (file)
@@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
         */
        WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 
-       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+       if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE))
                return NULL;
 
        if (pmd_present(*pmd) && pmd_devmap(*pmd))
@@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 
        assert_spin_locked(pud_lockptr(mm, pud));
 
-       if (flags & FOLL_WRITE && !pud_write(*pud))
+       if (!pud_access_permitted(*pud, flags & FOLL_WRITE))
                return NULL;
 
        if (pud_present(*pud) && pud_devmap(*pud))
@@ -1386,7 +1386,7 @@ out_unlock:
  */
 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
 {
-       return pmd_write(pmd) ||
+       return pmd_access_permitted(pmd, WRITE) ||
               ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
 }
 
index 681b300..9a334f5 100644 (file)
@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
        }
 }
 
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+       if (addr & ~(huge_page_mask(hstate_vma(vma))))
+               return -EINVAL;
+       return 0;
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
        .open = hugetlb_vm_op_open,
        .close = hugetlb_vm_op_close,
+       .split = hugetlb_vm_op_split,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -4627,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
        pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
-       p4d = p4d_offset(pgd, addr);
+       p4d = p4d_alloc(mm, pgd, addr);
+       if (!p4d)
+               return NULL;
        pud = pud_alloc(mm, p4d, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
index e4738d5..3d47817 100644 (file)
@@ -1523,6 +1523,8 @@ static void kmemleak_scan(void)
                        if (page_count(page) == 0)
                                continue;
                        scan_block(page, page + 1, NULL);
+                       if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page))))
+                               cond_resched();
                }
        }
        put_online_mems();
index 375cf32..751e97a 100644 (file)
@@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma,
 {
        struct file *file = vma->vm_file;
 
+       *prev = vma;
 #ifdef CONFIG_SWAP
        if (!file) {
-               *prev = vma;
                force_swapin_readahead(vma, start, end);
                return 0;
        }
 
        if (shmem_mapping(file->f_mapping)) {
-               *prev = vma;
                force_shm_swapin_readahead(vma, start, end,
                                        file->f_mapping);
                return 0;
@@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma,
                return 0;
        }
 
-       *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
index 50e6906..ac2ffd5 100644 (file)
@@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        memcg_check_events(memcg, page);
 
        if (!mem_cgroup_is_root(memcg))
-               css_put(&memcg->css);
+               css_put_many(&memcg->css, nr_entries);
 }
 
 /**
index 85e7a87..5eb3d25 100644 (file)
@@ -3948,7 +3948,7 @@ static int handle_pte_fault(struct vm_fault *vmf)
        if (unlikely(!pte_same(*vmf->pte, entry)))
                goto unlock;
        if (vmf->flags & FAULT_FLAG_WRITE) {
-               if (!pte_write(entry))
+               if (!pte_access_permitted(entry, WRITE))
                        return do_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }
@@ -4013,7 +4013,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
                        /* NUMA case for anonymous PUDs would go here */
 
-                       if (dirty && !pud_write(orig_pud)) {
+                       if (dirty && !pud_access_permitted(orig_pud, WRITE)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
@@ -4046,7 +4046,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
-                       if (dirty && !pmd_write(orig_pmd)) {
+                       if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) {
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
@@ -4336,7 +4336,7 @@ int follow_phys(struct vm_area_struct *vma,
                goto out;
        pte = *ptep;
 
-       if ((flags & FOLL_WRITE) && !pte_write(pte))
+       if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                goto unlock;
 
        *prot = pgprot_val(pte_pgprot(pte));
index 924839f..a4d5468 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *new;
        int err;
 
-       if (is_vm_hugetlb_page(vma) && (addr &
-                                       ~(huge_page_mask(hstate_vma(vma)))))
-               return -EINVAL;
+       if (vma->vm_ops && vma->vm_ops->split) {
+               err = vma->vm_ops->split(vma, addr);
+               if (err)
+                       return err;
+       }
 
        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!new)
index c86fbd1..c957be3 100644 (file)
@@ -550,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
         */
        set_bit(MMF_UNSTABLE, &mm->flags);
 
-       tlb_gather_mmu(&tlb, mm, 0, -1);
        for (vma = mm->mmap ; vma; vma = vma->vm_next) {
                if (!can_madv_dontneed_vma(vma))
                        continue;
@@ -565,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
                 * we do not want to block exit_mmap by keeping mm ref
                 * count elevated without a good reason.
                 */
-               if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+               if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+                       tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
                        unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
                                         NULL);
+                       tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+               }
        }
-       tlb_finish_mmu(&tlb, 0, -1);
        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                        task_pid_nr(tsk), tsk->comm,
                        K(get_mm_counter(mm, MM_ANONPAGES)),
index e709503..586f312 100644 (file)
@@ -433,11 +433,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
        else
                bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
 
-       if (unlikely(bg_thresh >= thresh)) {
-               pr_warn("vm direct limit must be set greater than background limit.\n");
+       if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;
-       }
-
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
index d4096f4..73f5d45 100644 (file)
@@ -2507,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
        if (WARN_ON_ONCE(!mm_percpu_wq))
                return;
 
-       /* Workqueues cannot recurse */
-       if (current->flags & PF_WQ_WORKER)
-               return;
-
        /*
         * Do not drain if one is already in progress unless it's specific to
         * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -7656,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
        /*
         * In case of -EBUSY, we'd like to know which page causes problem.
-        * So, just fall through. We will check it in test_pages_isolated().
+        * So, just fall through. test_pages_isolated() has a tracepoint
+        * which will report the busy page.
+        *
+        * It is possible that busy pages could become available before
+        * the call to test_pages_isolated, and the range will actually be
+        * allocated.  So, if we fall through be sure to clear ret so that
+        * -EBUSY is not accidentally used or returned to caller.
         */
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
+       ret =0;
 
        /*
         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
index 6f099f9..94b6648 100755 (executable)
@@ -83,8 +83,11 @@ def print_result(symboltype, symbolformat, argc):
     for d, n in delta:
         if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
 
-    print("Total: Before=%d, After=%d, chg %+.2f%%" % \
-        (otot, ntot, (ntot - otot)*100.0/otot))
+    if otot:
+        percent = (ntot - otot) * 100.0 / otot
+    else:
+        percent = 0
+    print("Total: Before=%d, After=%d, chg %+.2f%%" % (otot, ntot, percent))
 
 if sys.argv[1] == "-c":
     print_result("Function", "tT", 3)
index 1f5ce95..39e07d8 100755 (executable)
 set -o errexit
 set -o nounset
 
+READELF="${CROSS_COMPILE}readelf"
+ADDR2LINE="${CROSS_COMPILE}addr2line"
+SIZE="${CROSS_COMPILE}size"
+NM="${CROSS_COMPILE}nm"
+
 command -v awk >/dev/null 2>&1 || die "awk isn't installed"
-command -v readelf >/dev/null 2>&1 || die "readelf isn't installed"
-command -v addr2line >/dev/null 2>&1 || die "addr2line isn't installed"
+command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed"
+command -v ${ADDR2LINE} >/dev/null 2>&1 || die "addr2line isn't installed"
+command -v ${SIZE} >/dev/null 2>&1 || die "size isn't installed"
+command -v ${NM} >/dev/null 2>&1 || die "nm isn't installed"
 
 usage() {
        echo "usage: faddr2line <object file> <func+offset> <func+offset>..." >&2
@@ -69,10 +76,10 @@ die() {
 find_dir_prefix() {
        local objfile=$1
 
-       local start_kernel_addr=$(readelf -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}')
+       local start_kernel_addr=$(${READELF} -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}')
        [[ -z $start_kernel_addr ]] && return
 
-       local file_line=$(addr2line -e $objfile $start_kernel_addr)
+       local file_line=$(${ADDR2LINE} -e $objfile $start_kernel_addr)
        [[ -z $file_line ]] && return
 
        local prefix=${file_line%init/main.c:*}
@@ -104,7 +111,7 @@ __faddr2line() {
 
        # Go through each of the object's symbols which match the func name.
        # In rare cases there might be duplicates.
-       file_end=$(size -Ax $objfile | awk '$1 == ".text" {print $2}')
+       file_end=$(${SIZE} -Ax $objfile | awk '$1 == ".text" {print $2}')
        while read symbol; do
                local fields=($symbol)
                local sym_base=0x${fields[0]}
@@ -156,10 +163,10 @@ __faddr2line() {
 
                # pass real address to addr2line
                echo "$func+$offset/$sym_size:"
-               addr2line -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;"
+               ${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;"
                DONE=1
 
-       done < <(nm -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }')
+       done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }')
 }
 
 [[ $# -lt 2 ]] && usage