From 9119f76d77286f1d46afbd10eaab0adfb53c8d59 Mon Sep 17 00:00:00 2001 From: Philippe Gerum Date: Thu, 7 Dec 2017 17:04:43 +0100 Subject: [PATCH] mm: ipipe: disable ondemand memory --- include/linux/sched/coredump.h | 1 + lib/ioremap.c | 8 ++- mm/memory.c | 97 ++++++++++++++++++++++++++++++++-- mm/mlock.c | 24 +++++++++ mm/mprotect.c | 11 +++- mm/vmalloc.c | 2 + 6 files changed, 136 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ec912d01126f..07d34a37f374 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_VM_PINNED 31 /* on-demand loading and COW disabled */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK) diff --git a/lib/ioremap.c b/lib/ioremap.c index b808a390e4c3..b4f3391c5324 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -175,7 +176,12 @@ int ioremap_page_range(unsigned long addr, break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); + /* APEI may invoke this for temporarily remapping pages in interrupt + * context - nothing we can or need to propagate globally. 
*/ + if (!in_interrupt()) { + __ipipe_pin_mapping_globally(start, end); + flush_cache_vmap(start, end); + } return err; } diff --git a/mm/memory.c b/mm/memory.c index a728bed16c20..d8408ad0d7bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +130,11 @@ EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; +static inline void cow_user_page(struct page *dst, + struct page *src, + unsigned long va, + struct vm_area_struct *vma); + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -939,8 +945,8 @@ out: static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss, struct page *uncow_page) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -1018,6 +1024,21 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * in the parent and the child */ if (is_cow_mapping(vm_flags)) { +#ifdef CONFIG_IPIPE + if (uncow_page) { + struct page *old_page = vm_normal_page(vma, addr, pte); + cow_user_page(uncow_page, old_page, addr, vma); + pte = mk_pte(uncow_page, vma->vm_page_prot); + + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page_add_new_anon_rmap(uncow_page, vma, addr, false); + rss[!!PageAnon(uncow_page)]++; + goto out_set_pte; + } +#endif /* CONFIG_IPIPE */ ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } @@ -1065,13 +1086,27 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, int progress = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; - + struct page *uncow_page = NULL; +#ifdef CONFIG_IPIPE + int do_cow_break = 0; again: + if (do_cow_break) { + uncow_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + 
if (uncow_page == NULL) + return -ENOMEM; + do_cow_break = 0; + } +#else +again: +#endif init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) + if (!dst_pte) { + if (uncow_page) + put_page(uncow_page); return -ENOMEM; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -1094,8 +1129,25 @@ again: progress++; continue; } +#ifdef CONFIG_IPIPE + if (likely(uncow_page == NULL) && likely(pte_present(*src_pte))) { + if (is_cow_mapping(vma->vm_flags) && + test_bit(MMF_VM_PINNED, &src_mm->flags) && + ((vma->vm_flags|src_mm->def_flags) & VM_LOCKED)) { + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap(src_pte); + add_mm_rss_vec(dst_mm, rss); + pte_unmap_unlock(dst_pte, dst_ptl); + cond_resched(); + do_cow_break = 1; + goto again; + } + } +#endif entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + vma, addr, rss, uncow_page); + uncow_page = NULL; if (entry.val) break; progress += 8; @@ -4642,6 +4694,41 @@ long copy_huge_page_from_user(struct page *dst_page, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_IPIPE + +int __ipipe_disable_ondemand_mappings(struct task_struct *tsk) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + int result = 0; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + down_write(&mm->mmap_sem); + if (test_bit(MMF_VM_PINNED, &mm->flags)) + goto done_mm; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (is_cow_mapping(vma->vm_flags) && + (vma->vm_flags & VM_WRITE)) { + result = __ipipe_pin_vma(mm, vma); + if (result < 0) + goto done_mm; + } + } + set_bit(MMF_VM_PINNED, &mm->flags); + + done_mm: + up_write(&mm->mmap_sem); + mmput(mm); + return result; +} +EXPORT_SYMBOL_GPL(__ipipe_disable_ondemand_mappings); + +#endif /* CONFIG_IPIPE */ + #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; diff 
--git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..392a7567b577 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -864,3 +864,27 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } + +#ifdef CONFIG_IPIPE +int __ipipe_pin_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + int ret, write, len; + + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return 0; + + if (!((vma->vm_flags & VM_DONTEXPAND) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm))) { + ret = populate_vma_page_range(vma, vma->vm_start, vma->vm_end, + NULL); + return ret < 0 ? ret : 0; + } + + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; + len = DIV_ROUND_UP(vma->vm_end, PAGE_SIZE) - vma->vm_start/PAGE_SIZE; + ret = get_user_pages(vma->vm_start, len, write, 0, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -EFAULT; +} +#endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 58b629bb70de..64066f97d96d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -41,7 +42,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + unsigned long pages = 0, flags; int target_node = NUMA_NO_NODE; /* @@ -96,6 +97,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } + flags = hard_local_irq_save(); ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) @@ -108,6 +110,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = pte_mkwrite(ptent); } ptep_modify_prot_commit(mm, addr, pte, ptent); + hard_local_irq_restore(flags); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -288,6 +291,12 @@ unsigned long change_protection(struct vm_area_struct *vma, 
unsigned long start, pages = hugetlb_change_protection(vma, start, end, newprot); else pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); +#ifdef CONFIG_IPIPE + if (test_bit(MMF_VM_PINNED, &vma->vm_mm->flags) && + ((vma->vm_flags | vma->vm_mm->def_flags) & VM_LOCKED) && + (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + __ipipe_pin_vma(vma->vm_mm, vma); +#endif return pages; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 673942094328..0ca2a331e3ac 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -232,6 +232,8 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, return err; } while (pgd++, addr = next, addr != end); + __ipipe_pin_mapping_globally(start, end); + return nr; } -- 2.34.1