From 2692ba0419164c474198b41de45e8958df41e715 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 4 May 2021 18:37:25 -0700 Subject: [PATCH 01/16] mm: cma: add trace events for CMA alloc perf testing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add cma and migrate trace events to enable CMA allocation performance to be measured via ftrace. [georgi.djakov@linaro.org: add the CMA instance name to the cma_alloc_start trace event] Link: https://lkml.kernel.org/r/20210326155414.25006-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210324160740.15901-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=7bc1aec5e28765ad18742824b3b972471807a632 Signed-off-by: Łukasz Stelmach Change-Id: I61b2cdb2a7416c69fc577080c08be50255921305 --- include/trace/events/cma.h | 42 +++++++++++++++++++++++++++++++++++++++++- include/trace/events/migrate.h | 22 ++++++++++++++++++++++ mm/cma.c | 4 ++++ mm/migrate.c | 2 ++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a88..be1525a 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,7 +8,7 @@ #include #include -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), @@ -61,6 +61,46 @@ TRACE_EVENT(cma_release, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned int count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%u align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d43439..f2c99060 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -81,6 +81,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/cma.c b/mm/cma.c index 18046bf..342b82d 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -428,6 +428,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, if (!count) 
goto out; + trace_cma_alloc_start(cma->name, count, align); + mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -469,6 +471,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } diff --git a/mm/migrate.c b/mm/migrate.c index 9d7ca1b..ba56339 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1430,6 +1430,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; + trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; -- 2.7.4 From 494b93600055204610f46f5766990e23ed0816b6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:31 -0700 Subject: [PATCH 02/16] mm: cma: add the CMA instance name to cma trace events MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There were missing places to add cma instance name. To identify each CMA instance, let's add the name for every cma trace. This patch also changes the existing cma_trace_alloc to cma_trace_finish since we have cma_alloc_start[1]. [1] https://lore.kernel.org/linux-mm/20210324160740.15901-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210330220237.748899-1-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Liam Mark Cc: Georgi Djakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=3aab8ae7aace3388da319a233edf48f0f5d26a44 Signed-off-by: Łukasz Stelmach Change-Id: I94fb2a7925f56e467c954f4d07d492cbd1b1c4e4 --- include/trace/events/cma.h | 28 +++++++++++++++++----------- mm/cma.c | 7 ++++--- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index be1525a..5cf385a 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -10,12 +10,13 @@ DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) @@ -23,13 +24,15 @@ DECLARE_EVENT_CLASS(cma_alloc_class, ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,24 +41,27 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; 
), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count) @@ -85,20 +91,20 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc, +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); #endif /* _TRACE_CMA_H */ diff --git a/mm/cma.c b/mm/cma.c index 342b82d..9f4179a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -472,12 +472,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); - trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } - trace_cma_alloc(pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align); /* * CMA can allocate multiple page blocks, which results in different @@ -533,7 +534,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); - trace_cma_release(pfn, pages, count); + trace_cma_release(cma->name, pfn, pages, count); return true; } -- 2.7.4 From 80a582d1f8d92dab248183a84ed7fb1cf2f4ee68 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 21 Apr 2021 14:44:12 +1000 Subject: [PATCH 03/16] mm: cma: support sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Since CMA is getting used more widely, it's more important to keep monitoring CMA statistics for system health since it's directly related to user experience. This patch introduces sysfs statistics for CMA, in order to provide some basic monitoring of the CMA allocator. * the number of CMA page successful allocations * the number of CMA page allocation failures These two values allow the user to calcuate the allocation failure rate for each CMA area. e.g.) /sys/kernel/mm/cma/WIFI/alloc_pages_[success|fail] /sys/kernel/mm/cma/SENSOR/alloc_pages_[success|fail] /sys/kernel/mm/cma/BLUETOOTH/alloc_pages_[success|fail] The cma_stat was intentionally allocated by dynamic allocation to harmonize with kobject lifetime management. 
https://lore.kernel.org/linux-mm/YCOAmXqt6dZkCQYs@kroah.com/ Link: https://lkml.kernel.org/r/20210324230759.2213957-1-minchan@kernel.org Link: https://lore.kernel.org/linux-mm/20210316100433.17665-1-colin.king@canonical.com/ Signed-off-by: Minchan Kim Signed-off-by: Colin Ian King Tested-by: Dmitry Osipenko Reviewed-by: Dmitry Osipenko Reviewed-by: Greg Kroah-Hartman Reviewed-by: John Hubbard Tested-by: Anders Roxell Cc: Suren Baghdasaryan Cc: John Dias Cc: Matthew Wilcox (Oracle) Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=43ca106fa8ec7d684776fbe561214d3b2b7cb9cb Signed-off-by: Łukasz Stelmach Change-Id: I80c0f2c8d60d250418002260b0a20e5a796a9823 --- Documentation/ABI/testing/sysfs-kernel-mm-cma | 25 ++++++ mm/Kconfig | 7 ++ mm/Makefile | 1 + mm/cma.c | 8 +- mm/cma.h | 23 ++++++ mm/cma_sysfs.c | 112 ++++++++++++++++++++++++++ 6 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma create mode 100644 mm/cma_sysfs.c diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma new file mode 100644 index 0000000..02b2bb6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/cma/ +Date: Feb 2021 +Contact: Minchan Kim +Description: + /sys/kernel/mm/cma/ contains a subdirectory for each CMA + heap name (also sometimes called CMA areas). + + Each CMA heap subdirectory (that is, each + /sys/kernel/mm/cma/ directory) contains the + following items: + + alloc_pages_success + alloc_pages_fail + +What: /sys/kernel/mm/cma//alloc_pages_success +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API succeeded to allocate + +What: /sys/kernel/mm/cma//alloc_pages_fail +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API failed to allocate diff --git a/mm/Kconfig b/mm/Kconfig index ddfae91..ffcae7b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -538,6 +538,13 @@ config CMA_DEBUGFS help Turns on the DebugFS interface for CMA. +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. 
+ config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA diff --git a/mm/Makefile b/mm/Makefile index 7cfa097..de62f2c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -109,6 +109,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o diff --git a/mm/cma.c b/mm/cma.c index 9f4179a..e827a1c 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -498,10 +498,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): returned %p\n", __func__, page); out: - if (page) + if (page) { count_vm_event(CMA_ALLOC_SUCCESS); - else + cma_sysfs_account_success_pages(cma, count); + } else { count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } return page; } diff --git a/mm/cma.h b/mm/cma.h index 42ae082..68ffad4 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -3,6 +3,12 @@ #define __MM_CMA_H__ #include +#include + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; struct cma { unsigned long base_pfn; @@ -16,6 +22,14 @@ struct cma { struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; @@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) return cma->count >> cma->order_per_bit; } +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; +#endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 0000000..eb2f39c --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim + */ + +#include +#include +#include + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static 
void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); -- 2.7.4 From 9116c4264c0881d1b6aed37d02b68521a9947075 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 21 Apr 2021 14:44:12 +1000 Subject: [PATCH 04/16] mm: use proper type for cma_[alloc|release] MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit size_t in cma_alloc is confusing since it makes people think it's byte count, not pages. Change it to unsigned long[1]. The unsigned int in cma_release is also not right so change it. Since we have unsigned long in cma_release, free_contig_range should also respect it. 
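For clarity, a hypothetical caller sketch (not part of this patch; the helper names are made up) showing how the new prototypes read: count is a number of pages, so a byte-sized request has to be converted explicitly instead of being passed straight through as a size_t.

#include <linux/cma.h>
#include <linux/mm.h>

static struct page *alloc_cma_buffer(struct cma *cma, size_t bytes)
{
	/* cma_alloc() takes a page count, not a byte count */
	unsigned long nr_pages = PAGE_ALIGN(bytes) >> PAGE_SHIFT;

	return cma_alloc(cma, nr_pages, 0, false);
}

static void free_cma_buffer(struct cma *cma, struct page *page, size_t bytes)
{
	cma_release(cma, page, PAGE_ALIGN(bytes) >> PAGE_SHIFT);
}

The align argument of cma_alloc() is an order of pages; it is passed as 0 here purely for brevity.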
[1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/ Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Origin: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=78fa51503fdbe463c96eef4c3cf69ca54032647a Signed-off-by: Łukasz Stelmach Change-Id: Ie6de7e88bdae134120d2d88178e5a71cd00bd460 --- include/linux/cma.h | 4 ++-- include/linux/gfp.h | 2 +- include/trace/events/cma.h | 22 +++++++++++----------- mm/cma.c | 17 +++++++++-------- mm/page_alloc.c | 6 +++--- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c..53fd8c3 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c603237..4f96cf1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -630,7 +630,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5cf385a..c3d3547 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -11,7 +11,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), @@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count), + unsigned long count), TP_ARGS(name, pfn, page, count), @@ -50,7 +50,7 @@ TRACE_EVENT(cma_release, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p 
count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, @@ -69,13 +69,13 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start, __entry->align = align; ), - TP_printk("name=%s count=%u align=%u", + TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) @@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start, DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); @@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); diff --git a/mm/cma.c b/mm/cma.c index e827a1c..b88e5d3 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -81,7 +81,7 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned int count) + unsigned long count) { unsigned long bitmap_no, bitmap_count; @@ -408,21 +408,21 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - bool no_warn) +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - size_t i; + unsigned long i; struct page *page = NULL; int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, count, align); if (!count) @@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); } @@ -520,14 +520,15 @@ out: * It returns false when provided pages do not belong to contiguous area and * true otherwise. 
*/ -bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; - pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b3439d..327e033 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8701,9 +8701,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8711,7 +8711,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); -- 2.7.4
From 7d5372737d346d76cf27907d096a57cb08d532af Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Fri, 2 Jul 2021 18:43:36 +0900 Subject: [PATCH 05/16] mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture
Transparent hugepage (THP) is one of the promising solutions for dealing with increased memory footprints, but so far it has mostly been focused on server-side environments. This patch claims that embedded systems can also benefit from THP when dealing with increased, but still small, memory footprints of their applications. The ARM64 architecture provides a fine-grained hugepage that supports a 64KB size, while the commonly used hugepage size is 2MB. We use both kinds of hugepages, chosen according to the required size of the virtual memory area. In this patch, we developed an eager-and-conservative policy. With this policy, the kernel does not allocate 2MB hugepages on page faults, to avoid enlarged page fault latencies; instead, it serves hugepage allocations with 64KB hugepages. Since 64KB hugepages require smaller-order pages than 2MB hugepages, they do not severely affect user-noticed memory latency caused by memory management tasks such as memory compaction. On the other hand, khugepaged builds both 64KB and 2MB hugepages, for both anonymous pages and file pages, according to the virtual memory size. Moreover, the proposed fine-grained THP (fTHP) supports hugepage mappings of pages in CMA. Since pages in CMA are already contiguous, fTHP simply allows hugepage mappings for 64KB- or 2MB-aligned memory areas. The proposed method achieves up to 32% throughput improvement over the Linux kernel with default THP when the system runs a read workload in lmbench [1] and the buffer fits in the CPU last-level cache. For large buffers (bigger than 2MB), the proposed method shows throughput similar to default THP in the Linux kernel.
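For reference, the 64KB hugepage used throughout this series is simply a run of contiguous PTEs with the ARM64 contiguous hint set. A short sketch of the geometry, assuming 4KB base pages and using the definitions added in arch/arm64/include/asm/huge_mm.h below:

/*
 * HPAGE_CONT_PTE_ORDER = CONT_PTE_SHIFT - PAGE_SHIFT = 16 - 12 = 4
 * HPAGE_CONT_PTE_NR    = 1 << HPAGE_CONT_PTE_ORDER   = 16 base pages
 * HPAGE_CONT_PTE_SIZE  = 16 * 4KB                    = 64KB
 *
 * A 64KB mapping is therefore 16 ordinary PTEs that all carry the
 * contiguous bit; simplified from arm64_make_huge_pte() in this patch:
 */
static inline pte_t make_64kb_huge_pte(struct page *hpage,
				       struct vm_area_struct *vma)
{
	return pte_mkcont(pte_mkhuge(mk_pte(hpage, vma->vm_page_prot)));
}

Because the hardware may cache such a run in a single TLB entry, the 64KB case gets much of the TLB benefit of a hugepage while needing only an order-4 allocation instead of the order-9 allocation a 2MB hugepage requires.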
[1] LMbench - Tools for peformance analysis: http://lmbench.sourceforge.net Change-Id: I750528db8f04b37fda39052bea775d18ca5d53fb Signed-off-by: Sung-hun Kim Signed-off-by: Marek Szyprowski --- arch/arm64/include/asm/finegrained_thp.h | 11 + arch/arm64/include/asm/huge_mm.h | 261 +++++++ arch/arm64/include/asm/pgtable.h | 16 + arch/arm64/mm/Makefile | 2 + arch/arm64/mm/finegrained_thp.c | 26 + arch/arm64/mm/huge_memory.c | 1090 ++++++++++++++++++++++++++++++ arch/arm64/mm/mmu.c | 16 + fs/proc/meminfo.c | 18 + include/asm-generic/finegrained_thp.h | 8 + include/asm-generic/huge_mm.h | 57 ++ include/linux/huge_mm.h | 12 + include/linux/mm.h | 5 + include/linux/mmu_notifier.h | 19 + include/linux/mmzone.h | 10 + include/linux/pgtable.h | 10 +- include/linux/rmap.h | 3 + include/linux/swapops.h | 13 + include/linux/vm_event_item.h | 3 + include/uapi/asm-generic/mman-common.h | 4 + kernel/dma/Kconfig | 1 + kernel/events/uprobes.c | 7 + mm/Kconfig | 17 + mm/filemap.c | 7 + mm/gup.c | 4 + mm/huge_memory.c | 118 +++- mm/internal.h | 3 + mm/ioremap.c | 51 ++ mm/khugepaged.c | 937 ++++++++++++++++++++++++- mm/madvise.c | 10 + mm/memcontrol.c | 10 + mm/memory.c | 197 +++++- mm/migrate.c | 10 + mm/mmap.c | 41 ++ mm/mprotect.c | 7 + mm/mremap.c | 11 + mm/rmap.c | 86 ++- mm/shmem.c | 115 +++- mm/swap_slots.c | 4 + mm/swapfile.c | 11 +- mm/truncate.c | 5 + mm/vmscan.c | 9 + mm/vmstat.c | 13 + 42 files changed, 3218 insertions(+), 40 deletions(-) create mode 100644 arch/arm64/include/asm/finegrained_thp.h create mode 100644 arch/arm64/include/asm/huge_mm.h create mode 100644 arch/arm64/mm/finegrained_thp.c create mode 100644 arch/arm64/mm/huge_memory.c create mode 100644 include/asm-generic/finegrained_thp.h create mode 100644 include/asm-generic/huge_mm.h diff --git a/arch/arm64/include/asm/finegrained_thp.h b/arch/arm64/include/asm/finegrained_thp.h new file mode 100644 index 0000000..6f3d9bb --- /dev/null +++ b/arch/arm64/include/asm/finegrained_thp.h @@ -0,0 +1,11 @@ +#ifndef __ASM_FINEGRAINED_THP_H +#define __ASM_FINEGRAINED_THP_H +#ifdef CONFIG_FINEGRAINED_THP +extern void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug); +#else /* CONFIG_FINEGRAINED_THP */ +static inline void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug) +{} +#endif /* CONFIG_FINEGRAINED_THP */ +#endif /* __ASM_FINEGRAINED_THP_H */ diff --git a/arch/arm64/include/asm/huge_mm.h b/arch/arm64/include/asm/huge_mm.h new file mode 100644 index 0000000..cc44800 --- /dev/null +++ b/arch/arm64/include/asm/huge_mm.h @@ -0,0 +1,261 @@ +#ifndef __ASM_HUGE_MM_H +#define __ASM_HUGE_MM_H + +#ifdef CONFIG_FINEGRAINED_THP +#include /* for compound_order/compound_nr */ +#include + +#define HPAGE_CONT_PTE_MASK CONT_PTE_MASK +#define HPAGE_CONT_PTE_SIZE CONT_PTE_SIZE +#define HPAGE_CONT_PTE_ORDER (CONT_PTE_SHIFT-PAGE_SHIFT) +#define HPAGE_CONT_PTE_NR (1 << HPAGE_CONT_PTE_ORDER) + +extern int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, + struct vm_area_struct *vma, int *rss); + +extern vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf); + +static inline vm_fault_t arch_do_huge_pte_anonymous_page( + struct vm_fault *vmf) +{ + return arm64_do_huge_pte_anonymous_page(vmf); +} + +extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte); +extern int change_huge_pte(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, pgprot_t newprot, + unsigned 
long cp_flags); + +extern pte_t ptep_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); + +/* + * Below codes should be moved to arm64-dependent codes + * Most codes are borrowed from arch/arm64/mm/hugetlbpage.c + */ + +#define HPAGE_CONT_PTE_CACHE_INDEX_MASK (HPAGE_CONT_PTE_NR - 1) + +static inline bool transhuge_adv_vma_suitable(struct vm_area_struct *vma, + unsigned long haddr) +{ + /* Don't have to check pgoff for anonymous vma */ + if (!vma_is_anonymous(vma)) { + if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CONT_PTE_CACHE_INDEX_MASK) + != (vma->vm_pgoff & HPAGE_CONT_PTE_CACHE_INDEX_MASK)) + return false; + } + + if (haddr < vma->vm_start || haddr + HPAGE_CONT_PTE_SIZE >= vma->vm_end) + return false; + return true; +} + +static inline pgprot_t thp_pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +static inline pte_t arm64_make_huge_pte(struct page *hpage, + struct vm_area_struct *vma) +{ + return pte_mkcont(pte_mkhuge(mk_pte(hpage, vma->vm_page_prot))); +} + +static inline pte_t arch_make_huge_pte(struct page *hpage, + struct vm_area_struct *vma) +{ + return arm64_make_huge_pte(hpage, vma); +} + +static inline void arm64_clear_and_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + int i; + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); + + flush_tlb_range(&vma, saddr, addr); +} + +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline void arm64_set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff) +{ + int i; + unsigned long pfn; + pgprot_t hugeprot; + + pfn = pte_pfn(pte); + hugeprot = thp_pte_pgprot(pte); + + arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR); + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE, pfn += 1) + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); +} + +static inline void arch_set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff) +{ + arm64_set_huge_pte_at(mm, addr, ptep, pte, headoff); +} + +static inline void arch_clear_huge_pte_range(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR); +} + +extern vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page); + +static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, + struct page *page) +{ + return arm64_do_set_huge_pte(vmf, page); +} + +extern vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte); + +static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry) +{ + int ret = VM_FAULT_FALLBACK; + + if (pte_cont(entry)) + ret = arm64_wp_huge_pte(vmf, entry); + return ret; +} + +extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte); + +static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf, pte_t entry) +{ + if (pte_cont(entry)) { + huge_cont_pte_set_accessed(vmf, entry); + return true; + } + return false; +} + +static inline pte_t arch_pte_clearhuge(pte_t pte) +{ + if (pte_cont(pte)) + return pte_clearhuge(pte); + return pte; +} + +extern int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot); + 
+static inline int arch_remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + return arm64_remap_pte_range(mm, pmd, addr, end, pfn, prot); +} + +void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd, + pte_t *pte, unsigned long address, + bool freeze, struct page *page); + +#define split_huge_pte(__vma, __pmd, __pte, __address) \ + do { \ + pte_t *____pte = (__pte); \ + if (is_swap_pte(*____pte) || pte_cont(*____pte) \ + || pte_devmap(*____pte)) \ + __split_huge_pte(__vma, __pmd, __pte, __address, \ + false, NULL); \ + } while (0) + +void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page); +extern bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma, + unsigned long vm_flags, int nr_pages); +extern bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma, + unsigned long vm_flags, int nr_pages); + +static inline bool arch_hugepage_vma_shmem_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return arm64_hugepage_vma_shmem_check(vma, vm_flags, + HPAGE_CONT_PTE_NR); +} + +static inline bool arch_hugepage_vma_file_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return arm64_hugepage_vma_file_check(vma, vm_flags, + HPAGE_CONT_PTE_NR); +} + +#else /* CONFIG_FINEGRAINED_THP */ + +static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry) +{ + return VM_FAULT_FALLBACK; +} + +static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf, + pte_t entry) +{ + return false; +} + +static inline pte_t arch_pte_clearhuge(pte_t pte) +{ + return pte; +} + +static inline pte_t arch_make_huge_pte(struct page *hpage, + struct vm_area_struct *vma) +{ + return mk_pte(hpage, vma->vm_page_prot); +} + +static inline vm_fault_t arch_do_huge_pte_anonymous_page(struct vm_fault *vmf) +{ + return VM_FAULT_FALLBACK; +} + +static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, + struct page *page) +{ + return VM_FAULT_FALLBACK; +} + +static inline void arch_set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff) +{} + +static inline void arch_clear_huge_pte_range(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{} + +static inline bool arch_hugepage_vma_shmem_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} + +static inline bool arch_hugepage_vma_file_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} +#endif /* CONFIG_FINEGRAINED_THP */ +#endif /* __ASM_HUGE_MM_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 717f13d5..05ff2c5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -153,6 +153,14 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) return pte; } +#ifdef CONFIG_FINEGRAINED_THP +static inline pte_t pte_clearhuge(pte_t pte) +{ + pte = clear_pte_bit(pte, __pgprot(PTE_CONT)); + return pte; +} +#endif /* CONFIG_FINEGRAINED_THP */ + static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) { pmd_val(pmd) &= ~pgprot_val(prot); @@ -325,6 +333,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, */ #define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_FINEGRAINED_THP +/* 64KB hugepage definition for THP */ +#define pte_trans_huge(pte) (pte_val(pte) && !(pte_val(pte) & PTE_TABLE_BIT)) +#endif /* 
CONFIG_FINEGRAINED_THP */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* * Hugetlb definitions. */ diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3..a1d152e 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -9,6 +9,8 @@ obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o +obj-$(CONFIG_FINEGRAINED_THP) += huge_memory.o +obj-$(CONFIG_FINEGRAINED_THP) += finegrained_thp.o KASAN_SANITIZE_physaddr.o += n obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/finegrained_thp.c b/arch/arm64/mm/finegrained_thp.c new file mode 100644 index 0000000..5ebb4ac --- /dev/null +++ b/arch/arm64/mm/finegrained_thp.c @@ -0,0 +1,26 @@ +#include +#include + +bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma, + unsigned long vm_flags, int nr_pages) +{ + /* Enabled via shmem mount options or sysfs settings. */ + if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + nr_pages); + } + return false; +} + +bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma, + unsigned long vm_flags, int nr_pages) +{ + /* Read-only file mappings need to be aligned for THP to work. */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + (vm_flags & VM_DENYWRITE)) { + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + nr_pages); + } + return false; +} + diff --git a/arch/arm64/mm/huge_memory.c b/arch/arm64/mm/huge_memory.c new file mode 100644 index 0000000..2ef1a21 --- /dev/null +++ b/arch/arm64/mm/huge_memory.c @@ -0,0 +1,1090 @@ +/* + * Hugepage support for arm64 architecture + * + * 21.08.07. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_FINEGRAINED_THP +pte_t ptep_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t pte; + int i; + + VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK); + VM_BUG_ON(!pte_cont(*ptep)); + pte = ptep_get_and_clear(vma->vm_mm, address, ptep); + + for (i = 1; i < HPAGE_CONT_PTE_NR; i++) + ptep_get_and_clear(vma->vm_mm, address + PAGE_SIZE * i, ptep + i); + + flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE); + return pte; +} + +#define USE_THP_PRINT_CONT_TABLE +#ifdef USE_THP_PRINT_CONT_TABLE +void thp_print_cont_pte_table(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned long line) +{ + int i, pid = 0; + + if (mm->owner) { + pr_info("THP: %s from %lu proc-%d(%s)\n", __func__, line, + task_pid_nr(mm->owner), mm->owner->comm); + pid = task_pid_nr(mm->owner); + } else + pr_info("THP: %s from %lu\n", __func__, line); + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE) { + pr_info("%lx: %llx pid(%d)\n", addr, pte_val(*ptep), pid); + } +} +#else +void thp_print_cont_pte_table(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned long line) +{} +#endif /* USE_THP_PRINT_CONT_TABLE */ + +/* + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation + */ +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +{ + const bool 
vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + + /* Always do synchronous compaction */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return GFP_TRANSHUGE_LIGHT; +} + +/* + * a caller must hold both locks of dst and src + */ +int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, unsigned long haddr, + struct vm_area_struct *vma, int *rss) +{ + struct page *src_page; + unsigned long addr = haddr; + pte_t pte, *_pte; + + pte = *src_pte; + + src_page = vm_normal_page(vma, addr, pte); + if (!src_page) + return -EAGAIN; + + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + if (rss) + rss[MM_ANONPAGES] += HPAGE_CONT_PTE_NR; + else + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR); + + _pte = src_pte; + while (addr < haddr + HPAGE_CONT_PTE_SIZE) { + ptep_set_wrprotect(src_mm, addr, _pte); + addr += PAGE_SIZE; + } + pte = pte_mkold(pte_wrprotect(pte)); + arm64_set_huge_pte_at(dst_mm, haddr, dst_pte, pte, 0); + + return 0; +} + +vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page) +{ + int i; + pte_t entry; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK; + pgoff_t index, pgoff, addroff, headoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + if (!transhuge_adv_vma_suitable(vma, haddr)) + return VM_FAULT_FALLBACK; + + page = compound_head(page); + index = page->index; + pgoff = vmf->pgoff; + addroff = (vmf->address - haddr) >> PAGE_SHIFT; + + if (pgoff - index != addroff) + return VM_FAULT_FALLBACK; + + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. 
+ */ + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + + if (unlikely(pmd_none(*vmf->pmd))) { + if (pte_alloc(vma->vm_mm, vmf->pmd)) + return VM_FAULT_OOM; + smp_wmb(); + } + + /* The head offset indicates the position of the first page in the hugepage */ + headoff = (addroff + (HPAGE_CONT_PTE_NR - pgoff)) % HPAGE_CONT_PTE_NR; + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, haddr, &vmf->ptl); + if (!vmf->pte || unlikely(!pte_none(*vmf->pte))) { + spin_unlock(vmf->ptl); + vmf->pte = NULL; + return ret; + } + + entry = arm64_make_huge_pte(compound_head(page), vma); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) + flush_icache_page(vma, page + i); + if (write && !(vma->vm_flags & VM_SHARED)) { + add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR); + if (PageAnon(page)) + page_add_new_anon_rmap(page, vma, haddr, true); + } else { + add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR); + page_add_file_rmap(page, true); + } + + arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, headoff); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); + count_vm_event(THP_FILE_MAPPED); + return 0; +} + +static vm_fault_t arm64_do_huge_pte_wp_page_fallback(struct vm_fault *vmf, + pte_t orig_pte, struct page *page) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK; + int i; + vm_fault_t ret = 0; + struct page **pages; + struct mmu_notifier_range range; + + pages = kmalloc_array(HPAGE_CONT_PTE_NR, sizeof(struct page *), + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); + if (unlikely(!pages[i] || + mem_cgroup_charge(pages[i], vma->vm_mm, + GFP_KERNEL))) { + if (pages[i]) + put_page(pages[i]); + while (--i >= 0) { + put_page(pages[i]); + } + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SIZE * i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_CONT_PTE_SIZE); + mmu_notifier_invalidate_range_start(&range); + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) + goto out_free_pages; + VM_BUG_ON_PAGE(!PageHead(page), page); + + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. 
+ * + * See Documentation/vm/mmu_notifier.rst + */ + vmf->pte = pte_offset_map(vmf->pmd, haddr); + ptep_huge_clear_flush_notify(vma, haddr, vmf->pte); + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, haddr += PAGE_SIZE) { + pte_t entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_page_private(pages[i], 0); + + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); + lru_cache_add_inactive_or_unevictable(pages[i], vma); + vmf->pte = pte_offset_map(vmf->pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); + } + kfree(pages); + + smp_wmb(); /* make pte visible before pmd */ + page_remove_rmap(page, true); + spin_unlock(vmf->ptl); + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(&range); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(vmf->ptl); + mmu_notifier_invalidate_range_end(&range); + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) { + set_page_private(pages[i], 0); + put_page(pages[i]); + } + kfree(pages); + goto out; +} + +vm_fault_t arm64_do_huge_pte_wp_page(struct vm_fault *vmf, pte_t orig_pte) +{ + struct vm_area_struct *vma = vmf->vma; + struct page *page = NULL, *new_page; + unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK; + struct mmu_notifier_range range; + gfp_t huge_gfp; /* for allocation and charge */ + vm_fault_t ret = 0; + + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); + VM_BUG_ON_VMA(!vma->anon_vma, vma); + + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) { + spin_unlock(vmf->ptl); + return ret; + } + + page = pte_page(orig_pte); + VM_BUG_ON_PAGE(!PageCompound(page), page); + page = compound_head(page); + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. + */ + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) { + spin_unlock(vmf->ptl); + unlock_page(page); + put_page(page); + return 0; + } + put_page(page); + } + + if (reuse_swap_page(page, NULL)) { + huge_cont_pte_set_accessed(vmf, orig_pte); + unlock_page(page); + spin_unlock(vmf->ptl); + return VM_FAULT_WRITE; + } + unlock_page(page); + get_page(page); + spin_unlock(vmf->ptl); + + /* + * For 2MB hugepage, the kernel just splits it + * into standard-sized pages and fallbacks to + * normal page fault handling path. + * + * For 64KB hugepage, I think alloc-on-COW can + * be get a performance benefit. This is because, + * significant time is consumed for copying contents + * of 2MB page, but 64KB page is much smaller than + * 2MB page. So, I guess that the overhead can be + * negligible. 
+ * + * TODO: accounting time overhead of below procedure + */ +#ifdef CONFIG_THP_CONSERVATIVE + goto fallback; +#endif + if (__transparent_hugepage_enabled(vma)) { + huge_gfp = alloc_hugepage_direct_gfpmask(vma); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, + HPAGE_CONT_PTE_ORDER); + } else + new_page = NULL; + + if (likely(new_page)) { + prep_transhuge_page(new_page); + } else { + if (!page) { + split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address); + ret |= VM_FAULT_FALLBACK; + } else { + ret = arm64_do_huge_pte_wp_page_fallback(vmf, orig_pte, page); + if (ret & VM_FAULT_OOM) { + split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address); + ret |= VM_FAULT_FALLBACK; + } + put_page(page); + } + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } + + if (unlikely(mem_cgroup_charge(new_page, vma->vm_mm, + huge_gfp))) { + put_page(new_page); + split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address); + if (page) + put_page(page); + ret |= VM_FAULT_FALLBACK; + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } + + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + + if (!page) + clear_huge_page(new_page, vmf->address, HPAGE_CONT_PTE_NR); + else + copy_user_huge_page(new_page, page, vmf->address, + vma, HPAGE_CONT_PTE_NR); + __SetPageUptodate(new_page); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_CONT_PTE_SIZE); + mmu_notifier_invalidate_range_start(&range); + + spin_lock(vmf->ptl); + if (page) + put_page(page); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) { + spin_unlock(vmf->ptl); + mem_cgroup_uncharge(new_page); + put_page(new_page); + goto out_mn; + } else { + pte_t entry; + + entry = arm64_make_huge_pte(new_page, vma); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + vmf->pte = pte_offset_map(vmf->pmd, haddr); + + page_add_new_anon_rmap(new_page, vma, haddr, true); + lru_cache_add_inactive_or_unevictable(new_page, vma); + + arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, 0); + update_mmu_cache(vma, vmf->address, vmf->pte); + + if (!page) { + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR); + } else { + VM_BUG_ON_PAGE(!PageHead(page), page); + page_remove_rmap(page, true); + put_page(page); + } + ret |= VM_FAULT_WRITE; + } + spin_unlock(vmf->ptl); +out_mn: + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. 
+ */ + mmu_notifier_invalidate_range_only_end(&range); +out: + return ret; +#ifdef CONFIG_THP_CONSERVATIVE +fallback: + __split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; +#endif /* CONFIG_THP_CONSERVATIVE */ +} + +/* the caller must hold lock */ +vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte) +{ + unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK; + pte_t *hpte_p; + + if (vma_is_anonymous(vmf->vma)) { + spin_unlock(vmf->ptl); + return arm64_do_huge_pte_wp_page(vmf, orig_pte); + } + + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + + hpte_p = pte_offset_map(vmf->pmd, haddr); + spin_unlock(vmf->ptl); + __split_huge_pte(vmf->vma, vmf->pmd, hpte_p, haddr, false, NULL); + spin_lock(vmf->ptl); + + return VM_FAULT_FALLBACK; +} + +static inline int check_huge_pte_range(pte_t *head) +{ + int i; + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, head++) { + if (!pte_none(*head)) + return 1; + } + return 0; +} + +void thp_print_cont_pte_table(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned long line); + +static vm_fault_t __do_huge_pte_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long offset, haddr = vmf->address & HPAGE_CONT_PTE_MASK; + pte_t entry; + vm_fault_t ret = 0; + + VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + return VM_FAULT_FALLBACK; + } + cgroup_throttle_swaprate(page, gfp); + + clear_huge_page(compound_head(page), haddr, HPAGE_CONT_PTE_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ + __SetPageUptodate(page); + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + + if (userfaultfd_missing(vma)) { + spin_unlock(vmf->ptl); + put_page(page); + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + + entry = arm64_make_huge_pte(page, vma); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + offset = (vmf->address - haddr) >> PAGE_SHIFT; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + if (!pte_none(*vmf->pte)) { + ret = VM_FAULT_FALLBACK; + goto unlock_release; + } + if (check_huge_pte_range(vmf->pte - offset)) { + /* recheck */ + /* TODO: COPY? 
*/ + ret = VM_FAULT_FALLBACK; + goto unlock_release; + } + + page_add_new_anon_rmap(page, vma, haddr, true); + lru_cache_add_inactive_or_unevictable(page, vma); + arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte - offset, entry, 0); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR); + + spin_unlock(vmf->ptl); + + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + + return 0; + +unlock_release: + spin_unlock(vmf->ptl); + put_page(page); + + return ret; +} + +vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct page *page; + unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK; + spinlock_t *ptl; + gfp_t gfp; + + if (!transhuge_adv_vma_suitable(vma, haddr)) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + return VM_FAULT_OOM; + if (!(vmf->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && + transparent_hugepage_use_zero_page()) { + return VM_FAULT_FALLBACK; + } + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + vmf->pte = pte_offset_map(vmf->pmd, haddr); + if (check_huge_pte_range(vmf->pte)) { + pte_unmap(vmf->pte); + spin_unlock(ptl); + return VM_FAULT_FALLBACK; + } + pte_unmap(vmf->pte); + spin_unlock(ptl); + + gfp = alloc_hugepage_direct_gfpmask(vma); + page = alloc_hugepage_vma(gfp, vma, + haddr, + HPAGE_CONT_PTE_ORDER); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + prep_transhuge_page(page); + return __do_huge_pte_anonymous_page(vmf, page, gfp); +} + +bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, pte_t **ptep, unsigned long *addr, + unsigned long end, struct page *page, + int *rss, spinlock_t *ptl) +{ + struct mm_struct *mm = tlb->mm; + unsigned long haddr = (*addr) & HPAGE_CONT_PTE_MASK; + unsigned long range_end = + ((haddr + HPAGE_CONT_PTE_SIZE) > end) ? 
end : + haddr + HPAGE_CONT_PTE_SIZE; + size_t size = range_end - haddr; + unsigned long map_count = size >> PAGE_SHIFT; + pte_t *pte; + + pte = pte_offset_map(pmd, haddr); + + if ((*addr) == haddr && haddr + HPAGE_CONT_PTE_SIZE <= range_end) { + arm64_clear_and_flush(mm, *addr, pte, PAGE_SIZE, map_count); + page_remove_rmap(compound_head(page), true); + rss[mm_counter(page)] -= map_count; + __tlb_adjust_range(tlb, *addr, size); + __tlb_remove_tlb_entry(tlb, pte, *addr); + tlb_remove_page_size(tlb, page, size); + + *addr += size; + pte += map_count; + + if (*addr >= end) + *addr = end - PAGE_SIZE; + + *ptep = pte; + } else { + if (haddr < vma->vm_start) { + pr_err("haddr(%lx) is less than vm start(%lx)\n", + haddr, vma->vm_start); + thp_print_cont_pte_table(mm, haddr, pte, __LINE__); + } + + spin_unlock(ptl); + __split_huge_pte(vma, pmd, pte, haddr, false, NULL); + spin_lock(ptl); + } + + pte_unmap(pte); + + return map_count == HPAGE_CONT_PTE_NR; +} + +/* caller must hold a proper lock */ +void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte) +{ + int i; + pte_t entry, *pte; + unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + haddr = vmf->address & HPAGE_CONT_PTE_MASK; + pte = pte_offset_map(vmf->pmd, haddr); + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, haddr += PAGE_SIZE) { + entry = pte_mkyoung(*pte); + if (write) + entry = pte_mkwrite(pte_mkdirty(entry)); + ptep_set_access_flags(vmf->vma, haddr, pte, entry, write); + } + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); +} + +/* + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +{ + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); +} + +extern void mlock_vma_page(struct page *page); +extern void clear_page_mlock(struct page *page); + +struct page *follow_trans_huge_pte(struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page = NULL; + pte_t *pte; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) + goto out; + + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP)) + return ERR_PTR(-EFAULT); + + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + goto out; + + pte = pte_offset_map(pmd, addr); + page = pte_page(*pte); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if (!try_grab_page(page, flags)) + return ERR_PTR(-ENOMEM); + + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * We don't mlock() pte-mapped THPs. This way we can avoid + * leaking mlocked pages into non-VM_LOCKED VMAs. + * + * For anon THP: + * + * In most cases the pmd is the only mapping of the page as we + * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for + * writable private mappings in populate_vma_page_range(). + * + * The only scenario when we have the page shared here is if we + * mlocking read-only mapping shared over fork(). We skip + * mlocking such pages. + * + * For file THP: + * + * We can expect PageDoubleMap() to be stable under page lock: + * for file pages we set it in page_add_file_rmap(), which + * requires page to be locked. 
+ */ + + if (PageAnon(page) && compound_mapcount(page) != 1) + goto skip_mlock; + if (PageDoubleMap(page) || !page->mapping) + goto skip_mlock; + if (!trylock_page(page)) + goto skip_mlock; + if (page->mapping && !PageDoubleMap(page)) + mlock_vma_page(page); + unlock_page(page); + } +skip_mlock: + page += (addr & ~HPAGE_CONT_PTE_MASK) >> PAGE_SHIFT; + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); + +out: + return page; +} + +static inline pte_t ptep_invalidate(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + return __pte(xchg_relaxed(&pte_val(*ptep), (pte_val(*ptep) & ~PTE_VALID))); +} + +extern atomic_long_t nr_phys_cont_pte_pages; + +static int remap_try_huge_pte(struct mm_struct *mm, pte_t *pte, unsigned long addr, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + phys_addr_t phys_addr = __pfn_to_phys(pfn); + pte_t entry; + + if ((end - addr) != CONT_PTE_SIZE) + return 0; + + if (!IS_ALIGNED(addr, CONT_PTE_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE)) + return 0; + + entry = pte_mkspecial(pte_mkcont(pte_mkhuge(pfn_pte(pfn, prot)))); + arch_set_huge_pte_at(mm, addr, pte, entry, 0); + + atomic_long_add(HPAGE_CONT_PTE_NR, &nr_phys_cont_pte_pages); + + return 1; +} + +int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte, *mapped_pte; + unsigned long next; + spinlock_t *ptl; + int err = 0; + + mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } + + next = pte_cont_addr_end(addr, end); + if (remap_try_huge_pte(mm, pte, addr, next, pfn, prot)) { + pte += HPAGE_CONT_PTE_NR; + pfn += HPAGE_CONT_PTE_NR; + addr += HPAGE_CONT_PTE_SIZE; + } else { + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + pte++; + addr += PAGE_SIZE; + } + } while (addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(mapped_pte, ptl); + return err; +} + +/* caller must hold appropriate lock (pmd lock) */ +int change_huge_pte(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, pgprot_t newprot, unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t entry; + bool preserve_write; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + int i, ret; + + preserve_write = prot_numa && pte_write(*pte); + ret = 1; + + /* currently, we don't consider numa cases, but just remain them + * for the future work */ + if (prot_numa && is_huge_zero_page(pte_page(*pte))) + goto out; + + if (prot_numa && pte_protnone(*pte)) + goto out; + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) { + entry = ptep_invalidate(vma, addr, pte); + entry = pte_modify(entry, newprot); + if (preserve_write) + entry = pte_mk_savedwrite(entry); + entry = pte_mkcont(entry); + + set_pte_at(mm, addr, pte, entry); + pte++; + addr += PAGE_SIZE; + } + + flush_tlb_range(vma, addr, addr + HPAGE_CONT_PTE_SIZE); + ret = HPAGE_CONT_PTE_NR; +out: + return ret; +} + +static void __split_huge_pte_locked(struct vm_area_struct *vma, pte_t *pte, + unsigned long haddr, bool freeze) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pte_t old_pte, _pte; + bool young, write, soft_dirty, pte_migration = false, uffd_wp = false; + unsigned long addr; + int i; + + VM_BUG_ON(haddr & ~HPAGE_CONT_PTE_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + 
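
The remap path above takes a 64KB step only when the virtual address, the physical address and the remaining span all line up. A small self-contained model of that carving, using illustrative MODEL_* constants (4KB base pages assumed) rather than the kernel's own macros:

#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE		4096ULL
#define MODEL_CONT_PTE_SIZE	(16ULL * MODEL_PAGE_SIZE)	/* 64KB */

/*
 * Walk [addr, end) the way the remap loop does: take a 64KB step only when
 * the remaining span, the virtual address and the physical address are all
 * 64KB aligned, otherwise install a single 4KB PTE.
 */
static void carve(uint64_t addr, uint64_t end, uint64_t phys,
		  unsigned int *blocks, unsigned int *pages)
{
	*blocks = *pages = 0;
	while (addr < end) {
		uint64_t next = addr - (addr % MODEL_CONT_PTE_SIZE) + MODEL_CONT_PTE_SIZE;

		if (next > end)
			next = end;
		if (next - addr == MODEL_CONT_PTE_SIZE &&
		    !(addr % MODEL_CONT_PTE_SIZE) && !(phys % MODEL_CONT_PTE_SIZE)) {
			(*blocks)++;			/* one contiguous-PTE mapping */
			addr += MODEL_CONT_PTE_SIZE;
			phys += MODEL_CONT_PTE_SIZE;
		} else {
			(*pages)++;			/* one ordinary PTE */
			addr += MODEL_PAGE_SIZE;
			phys += MODEL_PAGE_SIZE;
		}
	}
}

int main(void)
{
	unsigned int blocks, pages;

	/* 34 pages starting one page below a 64KB boundary: 1 + 16 + 16 + 1 */
	carve(0x40f000, 0x40f000 + 34 * MODEL_PAGE_SIZE, 0x1000f000, &blocks, &pages);
	printf("64KB blocks: %u, 4KB pages: %u\n", blocks, pages);	/* 2 and 2 */
	return 0;
}
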
HPAGE_CONT_PTE_SIZE, vma); + + count_vm_event(THP_SPLIT_CONT_PTE); + + if (!vma_is_anonymous(vma)) { + _pte = ptep_huge_clear_flush_notify(vma, haddr, pte); + if (vma_is_dax(vma)) + return; + page = pte_page(_pte); + if (!PageDirty(page) && pte_dirty(_pte)) + set_page_dirty(page); + if (!PageReferenced(page) && pte_young(_pte)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + add_mm_counter(mm, mm_counter_file(page), -HPAGE_CONT_PTE_NR); + return; + } else if (is_huge_zero_page(pte_page(*pte))) { + pr_err("contiguous pte mapping for zero anon pages are not supported yet"); + BUG(); + } + + old_pte = ptep_huge_clear_flush_notify(vma, haddr, pte); + + pte_migration = is_pte_migration_entry(old_pte); + if (unlikely(pte_migration)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(old_pte); + page = pfn_to_page(swp_offset(entry)); + write = is_write_migration_entry(entry); + young = false; + soft_dirty = pte_swp_soft_dirty(old_pte); + uffd_wp = pte_swp_uffd_wp(old_pte); + } else { + page = pte_page(old_pte); + if (pte_dirty(old_pte)) + SetPageDirty(page); + write = pte_write(old_pte); + young = pte_young(old_pte); + soft_dirty = pte_soft_dirty(old_pte); + uffd_wp = pte_uffd_wp(old_pte); + } + + VM_BUG_ON_PAGE(!page_count(page), page); + page_ref_add(page, HPAGE_CONT_PTE_NR - 1); + + for (i = 0, addr = haddr; i < HPAGE_CONT_PTE_NR; + i++, addr += PAGE_SIZE, pte++) { + pte_t entry; + + if (freeze || pte_migration) { + swp_entry_t swp_entry; + swp_entry = make_migration_entry(page + i, write); + entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); + } else { + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); + entry = maybe_mkwrite(entry, vma); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + } + //BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, entry); + if (!pte_migration) + atomic_inc(&page[i]._mapcount); + pte_unmap(pte); + } + + if (!pte_migration) { + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && + !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) + atomic_inc(&page[i]._mapcount); + } + + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. 
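
Splitting a 64KB mapping means rebuilding 16 independent PTEs that keep the original write/young/soft-dirty state, or become migration entries when the split is done for freezing. A toy userspace model of that expansion, with struct model_pte standing in for real PTE bits and no claim to match any architecture's encoding:

#include <stdbool.h>
#include <stdio.h>

#define MODEL_SUBPAGES	16	/* 64KB / 4KB, assuming 4KB base pages */

/* illustrative only: the handful of bits the split loop above preserves */
struct model_pte {
	unsigned long pfn;
	bool write, young, soft_dirty, migration;
};

/*
 * Expand one 64KB mapping into 16 per-page entries.  When @freeze is set the
 * real code installs migration entries instead of present PTEs; here that is
 * reduced to a single flag.
 */
static void split_cont_mapping(struct model_pte huge, bool freeze,
			       struct model_pte out[MODEL_SUBPAGES])
{
	for (int i = 0; i < MODEL_SUBPAGES; i++) {
		out[i] = huge;
		out[i].pfn = huge.pfn + i;	/* each entry now maps one subpage */
		out[i].migration = freeze;
	}
}

int main(void)
{
	struct model_pte huge = { .pfn = 0x1000, .write = true, .young = true };
	struct model_pte ptes[MODEL_SUBPAGES];

	split_cont_mapping(huge, false, ptes);
	printf("last subpage: pfn=%#lx write=%d migration=%d\n",
	       ptes[15].pfn, ptes[15].write, ptes[15].migration);
	return 0;
}
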
*/ + __dec_lruvec_page_state(page, NR_ANON_64KB_THPS); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + unlock_page_memcg(page); + } + + smp_wmb(); + + if (freeze) { + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) { + page_remove_rmap(page + i, false); + put_page(page + i); + } + } +} + +void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd, + pte_t *pte, unsigned long address, + bool freeze, struct page *page) +{ + spinlock_t *ptl; + struct mmu_notifier_range range; + pte_t _pte; + bool locked = false; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_CONT_PTE_MASK, + (address & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); + + if (page) { + VM_WARN_ON_ONCE(!PageLocked(page)); + if (page != pte_page(*pte)) + goto out; + } +repeat: + if (pte_cont(*pte)) { + if (!page) { + page = pte_page(*pte); + /* + * An anonymous page must be locked, to ensure that a + * concurrent reuse_swap_page() sees stable mapcount; + * but reuse_swap_page() is not used on shmem or file, + * and page lock must not be taken when zap_pte_range() + * calls __split_huge_pte() while i_mmap_lock is held. + */ + if (PageAnon(page)) { + if (unlikely(!trylock_page(page))) { + _pte = *pte; + get_page(page); + spin_unlock(ptl); + lock_page(page); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, _pte))) { + unlock_page(page); + put_page(page); + page = NULL; + goto repeat; + } + put_page(page); + } + locked = true; + } + } + if (PageMlocked(page)) + clear_page_mlock(page); + } else if (!(pte_devmap(*pte) || is_pte_migration_entry(*pte))) + goto out; + __split_huge_pte_locked(vma, pte, range.start, freeze); +out: + spin_unlock(ptl); + if (locked && page) + unlock_page(page); + mmu_notifier_invalidate_range_only_end(&range); +} + +void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) +{ + unsigned long haddr = address & HPAGE_CONT_PTE_MASK; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_mm, haddr); + if (!pgd_present(*pgd)) + return; + + p4d = p4d_offset(pgd, haddr); + if (!p4d_present(*p4d)) + return; + + pud = pud_offset(p4d, haddr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, haddr); + if (!pmd_present(*pmd)) + return; + + pte = pte_offset_map(pmd, haddr); + if (!pte_present(*pte)) + return; + + __split_huge_pte(vma, pmd, pte, haddr, freeze, page); +} +#endif /* CONFIG_FINEGRAINED_THP */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6aabf1e..a32cc50 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1365,6 +1365,22 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } +#ifdef CONFIG_FINEGRAINED_THP +int cont_pte_set_huge(pte_t *ptep, phys_addr_t phys, pgprot_t prot) +{ + int i; + pte_t new_pte; + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++, phys += PAGE_SIZE, ptep++) { + new_pte = pfn_pte(__phys_to_pfn(phys), prot); + new_pte = pte_mkcont(new_pte); + set_pte(ptep, new_pte); + } + + return 1; +} +#endif /* CONFIG_FINEGRAINED_THP */ + int pud_clear_huge(pud_t *pudp) { if (!pud_sect(READ_ONCE(*pudp))) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a553..9a782664 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -130,15 +130,33 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); +#ifdef CONFIG_FINEGRAINED_THP + show_val_kb(m, "Anon64KBPages: ", + global_node_page_state(NR_ANON_64KB_THPS) * HPAGE_CONT_PTE_NR); +#endif /* CONFIG_FINEGRAINED_THP */ show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); +#ifdef CONFIG_FINEGRAINED_THP + show_val_kb(m, "ShmemPteMapped: ", + global_node_page_state(NR_SHMEM_PTEMAPPED) * HPAGE_CONT_PTE_NR); + show_val_kb(m, "File64KBPages: ", + global_node_page_state(NR_FILE_64KB_THPS) * HPAGE_CONT_PTE_NR); +#endif /* CONFIG_FINEGRAINED_THP */ show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); +#ifdef CONFIG_FINEGRAINED_THP + show_val_kb(m, "FileCPteMapped: ", + global_node_page_state(NR_FILE_PTEMAPPED) * HPAGE_CONT_PTE_NR); +#endif /* CONFIG_FINEGRAINED_THP */ #endif + show_val_kb(m, "PhysCPteMapped: ", + phys_cont_pte_pages()); + show_val_kb(m, "PhysPmdMapped: ", + phys_huge_pmd_pages() * HPAGE_PMD_NR); #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); diff --git a/include/asm-generic/finegrained_thp.h b/include/asm-generic/finegrained_thp.h new file mode 100644 index 0000000..08a3461 --- /dev/null +++ b/include/asm-generic/finegrained_thp.h @@ -0,0 +1,8 @@ +/* a generic header for fine-grained thp */ +#ifndef __ASM_FINEGRAINED_THP_H +#define __ASM_FINEGRAINED_THP_H +static inline void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug) +{} +#endif /* CONFIG_FINEGRAINED_THP */ +#endif /* __ASM_FINEGRAINED_THP_H */ diff --git a/include/asm-generic/huge_mm.h b/include/asm-generic/huge_mm.h new file mode 100644 index 0000000..48527cf --- /dev/null +++ b/include/asm-generic/huge_mm.h @@ -0,0 +1,57 @@ +/* a generic header for architecture-dependent hugepage */ +#ifndef __ASM_HUGE_MM_H +#define __ASM_HUGE_MM_H +#ifndef CONFIG_FINEGRAINED_THP +static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry) +{ + return VM_FAULT_FALLBACK; +} + +static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf, + pte_t entry) +{ + return false; +} + +static inline pte_t arch_pte_clearhuge(pte_t pte) +{ + return pte; +} + +static inline pte_t arch_make_huge_pte(struct page *hpage, + struct vm_area_struct *vma) +{ + return mk_pte(hpage, vma->vm_page_prot); +} + +static inline void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug) +{} + +static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, + struct page *page) +{} + +static inline void arch_set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff) +{} + +static inline void arch_clear_huge_pte_range(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{} + +static inline bool arch_hugepage_vma_shmem_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} + +static inline bool arch_hugepage_vma_file_check( + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} +#endif /* CONFIG_FINGRAINED_THP */ +#endif /* __ASM_HUGE_MM_H */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0365aa9..4f8818c6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,9 @@ #include #include 
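
The new /proc/meminfo counters added above can be checked from userspace with a few lines of C; the field names are taken from this hunk, the values are printed in kB by show_val_kb(), and on a kernel without this patch none of the lines will match:

#include <stdio.h>
#include <string.h>

/* counter names added by this patch; kernels without it print none of them */
static const char * const keys[] = {
	"Anon64KBPages:", "ShmemPteMapped:", "File64KBPages:",
	"FileCPteMapped:", "PhysCPteMapped:", "PhysPmdMapped:",
};

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strncmp(line, keys[i], strlen(keys[i])))
				fputs(line, stdout);	/* values are in kB */
	fclose(f);
	return 0;
}
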
/* only for vma_is_dax() */ +#ifdef CONFIG_FINEGRAINED_THP +#include /* for compound_order/compound_nr */ +#endif extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -272,8 +275,13 @@ static inline struct page *thp_head(struct page *page) static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); +#ifdef CONFIG_FINEGRAINED_THP + if (PageHead(page)) + return page[1].compound_order; +#else if (PageHead(page)) return HPAGE_PMD_ORDER; +#endif return 0; } @@ -285,7 +293,11 @@ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) +#ifdef CONFIG_FINEGRAINED_THP + return page[1].compound_nr; +#else return HPAGE_PMD_NR; +#endif return 1; } diff --git a/include/linux/mm.h b/include/linux/mm.h index b8eadd9..de2371d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2803,6 +2803,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_SPLIT_PTE 0x100000 /* pslit huge pte before returning */ + /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: @@ -3151,6 +3153,9 @@ static inline int pages_identical(struct page *page1, struct page *page2) return !memcmp_pages(page1, page2); } +extern unsigned long phys_cont_pte_pages(void); +extern unsigned long phys_huge_pmd_pages(void); + #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b820078..104ff57 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -592,6 +592,21 @@ static inline void mmu_notifier_range_init_migrate( ___pte; \ }) +#ifdef CONFIG_FINEGRAINED_THP +#define ptep_huge_clear_flush_notify(__vma, __address, __ptep) \ +({ \ + unsigned long ___addr = __address & HPAGE_CONT_PTE_MASK; \ + struct mm_struct *___mm = (__vma)->vm_mm; \ + pte_t ___pte; \ + \ + ___pte = ptep_huge_clear_flush(__vma, __address, __ptep); \ + mmu_notifier_invalidate_range(___mm, ___addr, \ + ___addr + HPAGE_CONT_PTE_SIZE); \ + \ + ___pte; \ +}) +#endif /* CONFIG_FINEGRAINED_THP */ + #define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ @@ -737,6 +752,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at +#ifdef CONFIG_FINEGRAINED_THP +#define ptep_huge_clear_flush_notify ptep_huge_clear_flush +#endif + static inline void mmu_notifier_synchronize(void) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9d0c454..26df92e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -193,9 +193,19 @@ enum node_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, +#ifdef CONFIG_FINEGRAINED_THP + NR_SHMEM_PTEMAPPED, + NR_FILE_64KB_THPS, +#endif /* CONFIG_FINEGRAINED_THP */ NR_FILE_THPS, +#ifdef CONFIG_FINEGRAINED_THP + NR_FILE_PTEMAPPED, +#endif /* CONFIG_FINEGRAINED_THP */ NR_FILE_PMDMAPPED, NR_ANON_THPS, +#ifdef CONFIG_FINEGRAINED_THP + NR_ANON_64KB_THPS, 
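
thp_order() and thp_nr_pages() now read the stored compound order because two THP sizes can coexist. The arithmetic, assuming 4KB base pages (order 4 for the contiguous-PTE size, order 9 for the PMD size), as a tiny standalone program:

#include <stdio.h>

/* assuming 4KB base pages: order-4 THP = 64KB, order-9 THP = 2MB */
int main(void)
{
	const unsigned int page_shift = 12;
	const unsigned int orders[] = { 4, 9 };

	for (unsigned int i = 0; i < 2; i++) {
		unsigned long nr = 1UL << orders[i];	/* what thp_nr_pages() reports */

		printf("order %u: %lu pages, %lu KB\n",
		       orders[i], nr, (nr << page_shift) >> 10);
	}
	return 0;
}
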
+#endif NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7c869ea..62a80bf 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1300,7 +1300,9 @@ static inline int p4d_clear_huge(p4d_t *p4d) return 0; } #endif /* !__PAGETABLE_P4D_FOLDED */ - +#ifdef CONFIG_FINEGRAINED_THP +int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot); +#endif /* CONFIG_FINEGRAINED_THP */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); @@ -1309,6 +1311,12 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifdef CONFIG_FINEGRAINED_THP +static inline int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +#endif /* CONFIG_FINEGRAINED_THP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index def5c62..6d6f374 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -98,6 +98,9 @@ enum ttu_flags { TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ +#ifdef CONFIG_FINEGRAINED_THP + TTU_SPLIT_HUGE_PTE = 0x200, /* split huge PTE if any */ +#endif }; #ifdef CONFIG_MMU diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d9b7c91..71aa4b7 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -274,6 +274,12 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } +#ifdef CONFIG_FINEGRAINED_THP +static inline int is_pte_migration_entry(pte_t pte) +{ + return !pte_present(pte) && is_migration_entry(pte_to_swp_entry(pte)); +} +#endif #else static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) @@ -303,6 +309,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } + +#ifdef CONFIG_FINEGRAINED_THP +static inline int is_pte_migration_entry(pte_t pte) +{ + return 0; +} +#endif /* CONFIG_FINEGRAINED_THP */ #endif #ifdef CONFIG_MEMORY_FAILURE diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 21d7c7f..77e4bdd 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -95,6 +95,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, +#ifdef CONFIG_FINEGRAINED_THP + THP_SPLIT_CONT_PTE, +#endif THP_SPLIT_PMD, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d..f5d33b8 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -30,6 +30,10 @@ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#ifdef CONFIG_FINEGRAINED_THP +#define MAP_FILE_THP 0x200000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#endif + #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git 
a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a..f3bb8b2 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -177,6 +177,7 @@ endchoice config CMA_ALIGNMENT int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 9 12 if FINEGRAINED_THP range 2 12 default 8 help diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 00b03587..50bc0fd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -475,7 +475,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: if (is_register) +#ifdef CONFIG_FINEGRAINED_THP + { + gup_flags |= FOLL_SPLIT_PMD | FOLL_SPLIT_PTE; + pr_info("THP-%s: FOLL_SPLIT_PTE called comm(%s)\n", __func__, current->comm); + } +#else /* CONFIG_FINEGRAINED_THP */ gup_flags |= FOLL_SPLIT_PMD; +#endif /* CONFIG_FINEGRAINED_THP */ /* Read the page with vaddr into memory */ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, &vma, NULL); diff --git a/mm/Kconfig b/mm/Kconfig index ffcae7b..3965f52 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -875,6 +875,23 @@ config READ_ONLY_THP_FOR_FS support of file THPs will be developed in the next few release cycles. +config FINEGRAINED_THP + bool "Fine-grained THP support (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE + + help + Allow khugepaged to create 64KB hugepages and 64KB hugepage + allocation on page faults. + + It is only supported by ARM64 architecture for now. + +config THP_CONSERVATIVE + bool "A conservative policy for fTHP (EXPERIMENTAL)" + depends on FINEGRAINED_THP + + help + In the conservative policy, only khugepaged can make hugepages + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/filemap.c b/mm/filemap.c index 125b69f..02099ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -206,7 +206,14 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { +#ifdef CONFIG_FINEGRAINED_THP + if (thp_nr_pages(page) == HPAGE_PMD_NR) + __dec_node_page_state(page, NR_FILE_THPS); + else + __dec_node_page_state(page, NR_FILE_64KB_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __dec_node_page_state(page, NR_FILE_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ filemap_nr_thps_dec(mapping); } diff --git a/mm/gup.c b/mm/gup.c index 054ff92..cd32ef8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -447,6 +447,10 @@ retry: return ERR_PTR(ret); goto retry; } +#ifdef CONFIG_FINEGRAINED_THP + else if (flags & FOLL_SPLIT_PTE && pte_cont(pte)) + split_huge_pte(vma, pmd, ptep, address); +#endif /* CONFIG_FINEGRAINED_THP */ /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. 
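
The gup and uprobes hunks above make FOLL_SPLIT_PTE behave for 64KB mappings the way FOLL_SPLIT_PMD already behaves for 2MB ones. A compressed model of that decision, with illustrative flag values (in the kernel the two checks sit at different page-table levels of the gup walk):

#include <stdbool.h>
#include <stdio.h>

/* illustrative flag values; only the decision shape mirrors the hunks above */
#define F_SPLIT_PMD	0x1u
#define F_SPLIT_PTE	0x2u

enum split_action { SPLIT_NONE, SPLIT_PMD, SPLIT_PTE };

static enum split_action gup_split_action(unsigned int flags,
					  bool pmd_is_huge, bool pte_is_cont)
{
	if ((flags & F_SPLIT_PMD) && pmd_is_huge)
		return SPLIT_PMD;		/* break up a 2MB mapping first */
	if ((flags & F_SPLIT_PTE) && pte_is_cont)
		return SPLIT_PTE;		/* break up a 64KB mapping first */
	return SPLIT_NONE;
}

int main(void)
{
	/* uprobe registration asks for both, so either size gets split */
	unsigned int flags = F_SPLIT_PMD | F_SPLIT_PTE;

	printf("%d\n", gup_split_action(flags, false, true));	/* 2 == SPLIT_PTE */
	return 0;
}
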
*/ if (unlikely(!try_grab_page(page, flags))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4a78514..20ea663 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -753,6 +753,13 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf, page, gfp); } +#ifndef CONFIG_FINEGRAINED_THP +vm_fault_t do_huge_pte_anonymous_page(struct vm_fault *vmf) +{ + return VM_FAULT_FALLBACK; +} +#endif /* CONFIG_FINEGRAINED_THP */ + static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) @@ -1109,6 +1116,9 @@ out: return ret; } +#ifdef CONFIG_FINEGRAINED_THP +#endif /* CONFIG_FINEGRAINED_THP */ + #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags) @@ -1660,6 +1670,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); + atomic_long_dec(&nr_phys_huge_pmd_pages); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); @@ -2183,6 +2194,61 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } +static int thp_pte_alloc_locked(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t new = pte_alloc_one(mm); + if (!new) + return -ENOMEM; + + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + mm_inc_nr_ptes(mm); + pmd_populate(mm, pmd, new); + new = NULL; + } + if (new) + pte_free(mm, new); + return 0; +} + +static int thp_remap_pte_range_locked(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + int err = 0; + + err = thp_pte_alloc_locked(mm, pmd); + if (err) + return err; + + pte = pte_offset_map(pmd, addr); + if (!pte) + return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } + + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + pte++; + addr += PAGE_SIZE; + } while (addr != end); + arch_leave_lazy_mmu_mode(); + return err; +} + +static inline pgprot_t thp_pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} + void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) { @@ -2209,7 +2275,19 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } repeat: - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd)) { + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; + pmd_t orig_pmd; + + orig_pmd = pmdp_huge_get_and_clear_full(vma, haddr, pmd, 0); + atomic_long_dec(&nr_phys_huge_pmd_pages); + thp_remap_pte_range_locked(mm, pmd, haddr, + haddr + HPAGE_PMD_SIZE, + pmd_pfn(orig_pmd), + thp_pmd_pgprot(orig_pmd)); + goto out; + } else if (pmd_trans_huge(*pmd) && vm_normal_page_pmd(vma, address, *pmd)) { if (!page) { page = pmd_page(*pmd); /* @@ -2301,7 +2379,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) split_huge_pmd_address(vma, start, false, NULL); - +#ifdef CONFIG_FINEGRAINED_THP + if (start & ~HPAGE_CONT_PTE_MASK && + (start & 
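
thp_pmd_pgprot() above recovers the protection bits of a huge PMD by XOR-ing the entry with a pfn-only entry built from the same pfn. The identity is easier to see with a toy encoding; real pmd_val()/pfn_pmd() layouts are architecture specific and only the XOR trick carries over:

#include <inttypes.h>
#include <stdio.h>

/* toy entry layout: pfn in the high bits, protection bits in the low 12 */
static uint64_t toy_mk_entry(uint64_t pfn, uint64_t prot)
{
	return (pfn << 12) | prot;
}

static uint64_t toy_entry_pgprot(uint64_t entry)
{
	uint64_t pfn = entry >> 12;

	/* whatever is not the pfn-only encoding must be protection bits */
	return toy_mk_entry(pfn, 0) ^ entry;
}

int main(void)
{
	uint64_t entry = toy_mk_entry(0x81234, 0x7e3);

	printf("recovered prot = %#" PRIx64 "\n", toy_entry_pgprot(entry));	/* 0x7e3 */
	return 0;
}
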
HPAGE_CONT_PTE_MASK) >= vma->vm_start && + (start & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end) + split_huge_pte_address(vma, start, false, NULL); +#endif /* * If the new end address isn't hpage aligned and it could * previously contain an hugepage: check if we need to split @@ -2311,6 +2394,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) split_huge_pmd_address(vma, end, false, NULL); +#ifdef CONFIG_FINEGRAINED_THP + if (end & ~HPAGE_CONT_PTE_MASK && + (end & HPAGE_CONT_PTE_MASK) >= vma->vm_start && + (end & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end) + split_huge_pte_address(vma, end, false, NULL); +#endif /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -2325,17 +2414,34 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) split_huge_pmd_address(next, nstart, false, NULL); +#ifdef CONFIG_FINEGRAINED_THP + if (nstart & ~HPAGE_CONT_PTE_MASK && + (nstart & HPAGE_CONT_PTE_MASK) >= next->vm_start && + (nstart & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= next->vm_end) + split_huge_pte_address(next, nstart, false, NULL); +#endif } } static void unmap_page(struct page *page) { +#ifdef CONFIG_FINEGRAINED_THP + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | + TTU_RMAP_LOCKED; +#else enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; +#endif bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); +#ifdef CONFIG_FINEGRAINED_THP + if (compound_order(page) == HPAGE_PMD_ORDER) + ttu_flags |= TTU_SPLIT_HUGE_PMD; + else + ttu_flags |= TTU_SPLIT_HUGE_PTE; +#endif /* CONFIG_FINEGRAINED_THP */ if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; @@ -2720,8 +2826,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { if (PageSwapBacked(head)) __dec_node_page_state(head, NR_SHMEM_THPS); - else + else { +#ifdef CONFIG_FINEGRAINED_THP + if (thp_nr_pages(head) == HPAGE_CONT_PTE_NR) + __dec_node_page_state(head, NR_FILE_64KB_THPS); + else +#endif /* CONFIG_FINEGRAINED_THP */ __dec_node_page_state(head, NR_FILE_THPS); + } } __split_huge_page(page, list, end, flags); diff --git a/mm/internal.h b/mm/internal.h index c43ccdd..171d962 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -612,6 +612,9 @@ static inline bool is_migrate_highatomic_page(struct page *page) void setup_zone_pageset(struct zone *zone); +extern atomic_long_t nr_phys_cont_pte_pages; +extern atomic_long_t nr_phys_huge_pmd_pages; + struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; diff --git a/mm/ioremap.c b/mm/ioremap.c index 5fa1ab4..50a9121 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -21,6 +21,10 @@ static int __read_mostly ioremap_pud_capable; static int __read_mostly ioremap_pmd_capable; static int __read_mostly ioremap_huge_disabled; +#ifdef CONFIG_FINEGRAINED_THP +static int __read_mostly ioremap_cont_pte_capable; +#endif + static int __init set_nohugeiomap(char *str) { ioremap_huge_disabled = 1; @@ -55,12 +59,45 @@ static inline int ioremap_pmd_enabled(void) return ioremap_pmd_capable; } +#ifdef CONFIG_FINEGRAINED_THP +static inline int ioremap_cont_pte_enabled(void) +{ + return ioremap_cont_pte_capable; +} +#endif /* CONFIG_FINEGRAINED_THP */ + #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int ioremap_p4d_enabled(void) { return 0; } static inline int 
ioremap_pud_enabled(void) { return 0; } static inline int ioremap_pmd_enabled(void) { return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static inline int ioremap_cont_pte_enabled(void) { return 0; } +#endif /* CONFIG_FINEGRAINED_THP */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifdef CONFIG_FINEGRAINED_THP +static int ioremap_try_huge_pte(pte_t *pte, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, + pgprot_t prot) +{ + int i; + + if (!ioremap_cont_pte_enabled()) + return 0; + if ((end - addr) != CONT_PTE_SIZE) + return 0; + if (!IS_ALIGNED(addr, CONT_PTE_SIZE)) + return 0; + if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE)) + return 0; + + for (i = 0; i < HPAGE_CONT_PTE_NR; i++) + if (pte_present(*(pte + i))) + return 0; + return cont_pte_set_huge(pte, phys_addr, prot); +} +#endif /* CONFIG_FINEGRAINED_THP */ + static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, pgtbl_mod_mask *mask) @@ -73,9 +110,23 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; do { +#ifdef CONFIG_FINEGRAINED_THP + if (addr + HPAGE_CONT_PTE_SIZE < end && + ioremap_try_huge_pte(pte, addr, end, phys_addr, prot)) { + pte += HPAGE_CONT_PTE_NR - 1; + pfn += HPAGE_CONT_PTE_NR; + addr += HPAGE_CONT_PTE_SIZE - PAGE_SIZE; + phys_addr += HPAGE_CONT_PTE_SIZE; + continue; + } + +#endif /* CONFIG_FINEGRAINED_THP */ BUG_ON(!pte_none(*pte)); set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; +#ifdef CONFIG_FINEGRAINED_THP + phys_addr += PAGE_SIZE; +#endif /* CONFIG_FINEGRAINED_THP */ } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index abab394..aa96e8e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -21,6 +21,8 @@ #include #include +#include +#include #include "internal.h" enum scan_result { @@ -78,6 +80,32 @@ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; +#ifdef CONFIG_FINEGRAINED_THP +/* + * thp_scan_hint: + * it used for providing hints to khugepaged + * which address space is changed recently. 
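
The ioremap change above advances the cursor by HPAGE_CONT_PTE_NR - 1 slots in the loop body and lets the loop header's own increment supply the final step, so both paths share one exit test. A small model of that bookkeeping, with the alignment checks reduced to a modulo test:

#include <stdio.h>

#define MODEL_CONT_NR	16	/* PTEs per 64KB block, assuming 4KB pages */

int main(void)
{
	unsigned int slot = 0, installs = 0, total_slots = 48;	/* 3 x 64KB worth */

	do {
		int block_ok = (slot % MODEL_CONT_NR) == 0;	/* stand-in for the alignment checks */

		installs++;				/* cont_pte_set_huge() or set_pte_at() */
		if (block_ok)
			slot += MODEL_CONT_NR - 1;	/* the body's partial advance */
		slot++;					/* the loop's own pte++/addr step */
	} while (slot != total_slots);

	printf("%u installs cover %u PTE slots\n", installs, total_slots);	/* 3 cover 48 */
	return 0;
}
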
+ */ +struct thp_scan_hint { + struct mm_slot *slot; + struct vm_area_struct *vma; + unsigned long diff; /* memory difference */ + unsigned long jiffies; /* time stamp for profiling purpose */ + struct list_head hint_list; +}; + +/* THP type descriptor */ +enum { + THP_TYPE_FAIL, /* cannot make hugepage */ + THP_TYPE_64KB, /* 64KB hugepage can be made, use CONT_PTE */ + THP_TYPE_2MB, /* 2MB hugepage can be made, use PMD */ +}; + +static unsigned int khugepaged_max_ptes_none_64kb __read_mostly; +static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly; +static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly; +#endif /* CONFIG_FINEGRAINED_THP */ + #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -113,10 +141,18 @@ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_type; + int nr_hint; + struct list_head hint_list; +#endif /* CONFIG_FINEGRAINED_THP */ }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +#ifdef CONFIG_FINEGRAINED_THP + .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list), +#endif }; #ifdef CONFIG_SYSFS @@ -394,6 +430,11 @@ int __init khugepaged_init(void) khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; +#ifdef CONFIG_FINEGRAINED_THP + khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1; + khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8; + khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2; +#endif return 0; } @@ -437,6 +478,10 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void clear_hint_list(struct mm_slot *slot); +#endif /* CONFIG_FINEGRAINED_THP */ + static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { @@ -445,8 +490,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; + /* Check arch-dependent shmem hugepage available */ + if (arch_hugepage_vma_shmem_check(vma, vm_flags)) + return true; /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { + else if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -455,8 +503,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) return false; + /* Check arch-dependent file hugepage available */ + if (arch_hugepage_vma_file_check(vma, vm_flags)) + return true; /* Read-only file mappings need to be aligned for THP to work. 
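
With the 64KB tunables initialised from HPAGE_CONT_PTE_NR, the same nr - 1, nr / 8 and nr / 2 formulas apply to both hugepage sizes. Assuming 4KB base pages (512 PTEs per 2MB PMD, 16 per 64KB block), the resulting defaults are easy to tabulate:

#include <stdio.h>

/* assuming 4KB base pages: 512 PTEs per 2MB PMD, 16 per 64KB cont-PTE block */
int main(void)
{
	const struct { const char *name; unsigned int nr; } sizes[] = {
		{ "2MB",  512 },
		{ "64KB",  16 },
	};

	for (unsigned int i = 0; i < 2; i++)
		printf("%-4s max_ptes_none=%u max_ptes_swap=%u max_ptes_shared=%u\n",
		       sizes[i].name,
		       sizes[i].nr - 1,		/* nr - 1 */
		       sizes[i].nr / 8,		/* nr / 8 */
		       sizes[i].nr / 2);	/* nr / 2 */
	return 0;
}
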
*/ - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && (vm_flags & VM_DENYWRITE)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); @@ -519,6 +570,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) return khugepaged_enter(vma, vm_flags); +#ifdef CONFIG_FINEGRAINED_THP + hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + hend = vma->vm_end & HPAGE_CONT_PTE_MASK; + if (hstart < hend) + return khugepaged_enter(vma, vm_flags); +#endif /* CONFIG_FINEGRAINED_THP */ return 0; } @@ -530,6 +587,9 @@ void __khugepaged_exit(struct mm_struct *mm) spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { +#ifdef CONFIG_FINEGRAINED_THP + clear_hint_list(mm_slot); +#endif hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); free = 1; @@ -594,23 +654,56 @@ static bool is_refcount_suitable(struct page *page) return page_count(page) == expected_refcount; } +#ifdef CONFIG_FINEGRAINED_THP +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte, + struct list_head *compound_pagelist, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, struct list_head *compound_pagelist) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *page = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; +#ifdef CONFIG_FINEGRAINED_THP + int max_ptes_shared, max_ptes_none; + int hpage_nr; + + if (hpage_type == THP_TYPE_64KB) { + hpage_nr = HPAGE_CONT_PTE_NR; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + } else { + hpage_nr = HPAGE_PMD_NR; + max_ptes_shared = khugepaged_max_ptes_shared; + max_ptes_none = khugepaged_max_ptes_none; + } +#endif /* CONFIG_FINEGRAINED_THP */ - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte+HPAGE_PMD_NR; +#endif _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { +#ifdef CONFIG_FINEGRAINED_THP if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + ++none_or_zero <= max_ptes_none) +#else /* CONFIG_FINEGRAINED_THP */ + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) +#endif /* CONFIG_FINEGRAINED_THP */ + { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -629,8 +722,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); +#ifdef CONFIG_FINEGRAINED_THP + if (page_mapcount(page) > 1 && + ++shared > max_ptes_shared) +#else /* CONFIG_FINEGRAINED_THP */ if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + ++shared > khugepaged_max_ptes_shared) +#endif /* CONFIG_FINEGRAINED_THP */ + { result = SCAN_EXCEED_SHARED_PTE; goto out; } @@ -732,15 +831,34 @@ out: return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void __collapse_huge_page_copy(pte_t *pte, struct page 
*page, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *src_page, *tmp; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr = (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR); +#endif + + for (_pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte + HPAGE_PMD_NR; +#endif _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; @@ -894,12 +1012,21 @@ static int khugepaged_find_target_node(void) return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static inline struct page *alloc_khugepaged_hugepage(int hpage_order) +#else static inline struct page *alloc_khugepaged_hugepage(void) +#endif { struct page *page; +#ifdef CONFIG_FINEGRAINED_THP + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + hpage_order); +#else page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), HPAGE_PMD_ORDER); +#endif if (page) prep_transhuge_page(page); return page; @@ -910,7 +1037,11 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { +#ifdef CONFIG_FINEGRAINED_THP + hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER); +#else hpage = alloc_khugepaged_hugepage(); +#endif if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -948,6 +1079,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } +#ifdef CONFIG_FINEGRAINED_THP +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type) +{ + struct page *page; + + if (hpage_type == THP_TYPE_64KB) + page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER); + else { + VM_BUG_ON(!*hpage); + page = *hpage; + } + return page; +} +#else /* CONFIG_FINEGRAINED_THP */ static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) { @@ -955,6 +1101,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) return *hpage; } +#endif /* CONFIG_FINEGRAINED_THP */ #endif /* @@ -964,8 +1111,13 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) * value (scan code). */ +#ifdef CONFIG_FINEGRAINED_THP +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap, int hpage_type) +#else static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct **vmap) +#endif { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -977,6 +1129,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + hend = vma->vm_end & HPAGE_CONT_PTE_MASK; + if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend) + return SCAN_ADDRESS_RANGE; + if (!hugepage_vma_check(vma, vma->vm_flags)) + return SCAN_VMA_CHECK; + return 0; + } +#endif /* CONFIG_FINEGRAINED_THP */ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) @@ -997,10 +1160,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * but with mmap_lock held to protect against vma changes. 
*/ +#ifdef CONFIG_FINEGRAINED_THP +static bool __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + int referenced, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, int referenced) +#endif /* CONFIG_FINEGRAINED_THP */ { int swapped_in = 0; vm_fault_t ret = 0; @@ -1011,9 +1181,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pmd = pmd, .pgoff = linear_page_index(vma, address), }; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_size = (hpage_type == THP_TYPE_64KB) ? + HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; +#endif vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + for (; +#ifdef CONFIG_FINEGRAINED_THP + vmf.address < address + hpage_size; +#else + vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; +#endif vmf.pte++, vmf.address += PAGE_SIZE) { vmf.orig_pte = *vmf.pte; if (!is_swap_pte(vmf.orig_pte)) @@ -1024,7 +1203,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { +#ifdef CONFIG_FINEGRAINED_THP + if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type)) +#else + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) +#endif + { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -1053,10 +1237,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return true; } +#ifdef CONFIG_FINEGRAINED_THP +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + int node, int referenced, int unmapped, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, int node, int referenced, int unmapped) +#endif /* CONFIG_FINEGRAINED_THP */ { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1069,7 +1261,14 @@ static void collapse_huge_page(struct mm_struct *mm, struct mmu_notifier_range range; gfp_t gfp; +#ifdef CONFIG_FINEGRAINED_THP + pte_t _pte; + + VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ? + ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK)); +#else VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#endif /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; @@ -1081,7 +1280,11 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); +#ifdef CONFIG_FINEGRAINED_THP + new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type); +#else new_page = khugepaged_alloc_page(hpage, gfp, node); +#endif if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out_nolock; @@ -1094,7 +1297,11 @@ static void collapse_huge_page(struct mm_struct *mm, count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); +#ifdef CONFIG_FINEGRAINED_THP + result = hugepage_vma_revalidate(mm, address, &vma, hpage_type); +#else result = hugepage_vma_revalidate(mm, address, &vma); +#endif if (result) { mmap_read_unlock(mm); goto out_nolock; @@ -1112,11 +1319,19 @@ static void collapse_huge_page(struct mm_struct *mm, * If it fails, we release mmap_lock and jump out_nolock. * Continuing to collapse causes inconsistency. 
*/ +#ifdef CONFIG_FINEGRAINED_THP + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced, hpage_type)) { + mmap_read_unlock(mm); + goto out_nolock; + } +#else /* CONFIG_FINEGRAINED_THP */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { mmap_read_unlock(mm); goto out_nolock; } +#endif /* CONFIG_FINEGRAINED_THP*/ mmap_read_unlock(mm); /* @@ -1125,7 +1340,11 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); +#ifdef CONFIG_FINEGRAINED_THP + result = hugepage_vma_revalidate(mm, address, &vma, hpage_type); +#else result = hugepage_vma_revalidate(mm, address, &vma); +#endif if (result) goto out; /* check if the pmd is still valid */ @@ -1134,8 +1353,14 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); +#ifdef CONFIG_FINEGRAINED_THP + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address, address + (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE)); +#else mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address, address + HPAGE_PMD_SIZE); +#endif mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1148,16 +1373,38 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_collapse_flush(vma, address, pmd); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + /* FIXME: clearing ptes here causes + * __collapse_huge_page_isolate and __collapse_huge_page_copy + * to fail, __collapse_huge_page_copy also clears ptes + */ + ; + else +#endif /* CONFIG_FINEGRAINED_THP */ + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); +#ifdef CONFIG_FINEGRAINED_THP + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ isolated = __collapse_huge_page_isolate(vma, address, pte, &compound_pagelist); +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(pte_ptl); if (unlikely(!isolated)) { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + pte_unmap(pte); + anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; + goto out; + } +#endif /* CONFIG_FINEGRAINED_THP */ pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1179,15 +1426,34 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); +#ifdef CONFIG_FINEGRAINED_THP + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); +#endif /* CONFIG_FINEGRAINED_THP */ pte_unmap(pte); __SetPageUptodate(new_page); + +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + /* 64KB hugepage */ + _pte = arch_make_huge_pte(new_page, vma); + _pte = maybe_mkwrite(pte_mkdirty(_pte), vma); + } else { + /* 2MB hugepage */ + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + } +#else /* CONFIG_FINEGRAINED_THP */ pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - +#endif /* CONFIG_FINEGRAINED_THP */ /* * spin_lock() below is not the 
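
From userspace, the usual way to give the collapse path above something to work on is an aligned, populated, MADV_HUGEPAGE-marked anonymous region; whether khugepaged then builds a 64KB or a 2MB page is decided by the policy in this patch, nothing in the demonstration below forces a size:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Demonstration only: a 4MB mapping always contains at least one naturally
 * aligned 2MB block and many aligned 64KB blocks for the scanner to consider.
 */
int main(void)
{
	size_t len = 4UL << 20;					/* 4MB */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise");				/* THP may be disabled */
	memset(p, 0xa5, len);					/* populate every page */
	printf("touched %zu bytes at %p; watch AnonHugePages/Anon64KBPages\n",
	       len, p);
	sleep(60);						/* give the scanner time to visit */
	return 0;
}
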
equivalent of smp_wmb(), so * this is needed to avoid the copy_huge_page writes to become @@ -1196,15 +1462,32 @@ static void collapse_huge_page(struct mm_struct *mm, smp_wmb(); spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_2MB) +#endif + BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); lru_cache_add_inactive_or_unevictable(new_page, vma); + +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + arch_set_huge_pte_at(mm, address, pte, _pte, 0); + else { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + } + update_mmu_cache_pmd(vma, address, pmd); +#else /* CONFIG_FINEGRAINED_THP */ pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(pmd_ptl); - *hpage = NULL; +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_2MB) +#endif + *hpage = NULL; khugepaged_pages_collapsed++; result = SCAN_SUCCEED; @@ -1213,16 +1496,27 @@ out_up_write: out_nolock: if (!IS_ERR_OR_NULL(*hpage)) mem_cgroup_uncharge(*hpage); +#ifdef CONFIG_FINEGRAINED_THP + if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB) + put_page(new_page); +#endif trace_mm_collapse_huge_page(mm, isolated, result); return; out: goto out_up_write; } +#ifdef CONFIG_FINEGRAINED_THP +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { pmd_t *pmd; pte_t *pte, *_pte; @@ -1234,7 +1528,26 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr; + int max_ptes_swap, max_ptes_none, max_ptes_shared; + + if (hpage_type == THP_TYPE_64KB) { + VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK); + hpage_nr = HPAGE_CONT_PTE_NR; + max_ptes_swap = khugepaged_max_ptes_swap_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + } else { + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + hpage_nr = HPAGE_PMD_NR; + max_ptes_swap = khugepaged_max_ptes_swap; + max_ptes_none = khugepaged_max_ptes_none; + max_ptes_shared = khugepaged_max_ptes_shared; + } +#else /* CONFIG_FINEGRAINED_THP */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#endif /* CONFIG_FINEGRAINED_THP */ pmd = mm_find_pmd(mm, address); if (!pmd) { @@ -1244,11 +1557,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_address = address, _pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte+HPAGE_PMD_NR; +#endif _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { +#ifdef CONFIG_FINEGRAINED_THP + if (++unmapped <= max_ptes_swap) +#else + if (++unmapped <= khugepaged_max_ptes_swap) +#endif + { /* * Always be strict with uffd-wp * enabled swap entries. 
Please see @@ -1266,7 +1589,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { +#ifdef CONFIG_FINEGRAINED_THP + ++none_or_zero <= max_ptes_none +#else + ++none_or_zero <= khugepaged_max_ptes_none +#endif + ) + { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1299,8 +1628,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } +#ifdef CONFIG_FINEGRAINED_THP + if (PageCompound(page) && PageTransHuge(compound_head(page))) { + result = SCAN_PAGE_COMPOUND; + goto out_unmap; + } + + if (page_mapcount(page) > 1 && + ++shared > max_ptes_shared) +#else if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + ++shared > khugepaged_max_ptes_shared) +#endif + { result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } @@ -1371,8 +1711,13 @@ out_unmap: if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_lock released */ +#ifdef CONFIG_FINEGRAINED_THP + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped, hpage_type); +#else collapse_huge_page(mm, address, hpage, node, referenced, unmapped); +#endif } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1387,6 +1732,9 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_test_exit(mm)) { +#ifdef CONFIG_FINEGRAINED_THP + clear_hint_list(mm_slot); +#endif /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1408,15 +1756,29 @@ static void collect_mm_slot(struct mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ +#ifdef CONFIG_FINEGRAINED_THP +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, int hpage_type) +#else static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +#endif { struct mm_slot *mm_slot; +#ifdef CONFIG_FINEGRAINED_THP + VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ? + ~HPAGE_CONT_PTE_MASK :~HPAGE_PMD_MASK)); +#else VM_BUG_ON(addr & ~HPAGE_PMD_MASK); +#endif spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + addr |= 0x01; +#endif if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); @@ -1440,10 +1802,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) spinlock_t *ptl; int count = 0; int i; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB; + int hpage_nr = (hpage_type == THP_TYPE_64KB) ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR; + int hpage_size = (hpage_type == THP_TYPE_64KB) ? 
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; + + if (hpage_type == THP_TYPE_64KB) + haddr = addr & HPAGE_CONT_PTE_MASK; +#endif +#ifdef CONFIG_FINEGRAINED_THP + if (!vma || !vma->vm_file || + vma->vm_start > haddr || vma->vm_end < haddr + hpage_size) + return; +#else /* CONFIG_FINEGRAINED_THP */ if (!vma || !vma->vm_file || vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) return; +#endif /* CONFIG_FINEGRAINED_THP */ /* * This vm_flags may not have VM_HUGEPAGE if the page was not @@ -1470,7 +1848,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { +#ifdef CONFIG_FINEGRAINED_THP + i < hpage_nr; +#else + i < HPAGE_PMD_NR; +#endif + i++, addr += PAGE_SIZE, pte++) { struct page *page; /* empty pte, skip */ @@ -1494,7 +1877,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) /* step 2: adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { +#ifdef CONFIG_FINEGRAINED_THP + i < hpage_nr; +#else + i < HPAGE_PMD_NR; +#endif + i++, addr += PAGE_SIZE, pte++) { struct page *page; if (pte_none(*pte)) @@ -1513,10 +1901,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) /* step 4: collapse pmd */ ptl = pmd_lock(vma->vm_mm, pmd); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + pte_t *ptep = pte_offset_map(pmd, haddr); + arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep); + spin_unlock(ptl); + } else { + _pmd = pmdp_collapse_flush(vma, haddr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + } +#else /* CONFIG_FINEGRAINED_THP*/ _pmd = pmdp_collapse_flush(vma, haddr, pmd); spin_unlock(ptl); mm_dec_nr_ptes(mm); pte_free(mm, pmd_pgtable(_pmd)); +#endif /* CONFIG_FINEGRAINED_THP */ drop_hpage: unlock_page(hpage); @@ -1551,12 +1952,22 @@ out: return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + int hpage_type) +#else static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +#endif { struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, _pmd; +#ifdef CONFIG_FINEGRAINED_THP + pte_t *ptep; + int hpage_size = (hpage_type == THP_TYPE_64KB) ? + HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; +#endif /* CONFIG_FINEGRAINED_THP */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1579,6 +1990,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->anon_vma) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK) + continue; + else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK) + continue; + if (vma->vm_end < addr + hpage_size) + continue; + + mm = vma->vm_mm; + pmd = mm_find_pmd(mm, addr); + if (!pmd) + continue; + if (mmap_write_trylock(mm)) { + spinlock_t *ptl = pmd_lock(mm, pmd); + if (hpage_type == THP_TYPE_64KB) { + /* 64KB hugepage */ + ptep = pte_offset_map(pmd, addr); + /* pte maps are established on page fault handling */ + arch_clear_huge_pte_range(mm, addr, ptep); + spin_unlock(ptl); + } else { + /* 2MB hugepage */ + /* + * We need exclusive mmap_sem to retract page table. 
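
The deferred-collapse bookkeeping above stores page-aligned addresses, so bit 0 is free to record which hugepage size the entry refers to. A toy encode/decode pair showing the trick; the real code additionally masks the address down to the hugepage boundary when it consumes the entry:

#include <stdint.h>
#include <stdio.h>

#define MODEL_TYPE_2MB	0
#define MODEL_TYPE_64KB	1

static uint64_t tag_addr(uint64_t haddr, int type)
{
	/* haddr is page aligned, so the low bit is unused */
	return type == MODEL_TYPE_64KB ? (haddr | 0x1) : haddr;
}

static void untag_addr(uint64_t tagged, uint64_t *haddr, int *type)
{
	*type = (tagged & 0x1) ? MODEL_TYPE_64KB : MODEL_TYPE_2MB;
	*haddr = tagged & ~0x1ULL;
}

int main(void)
{
	uint64_t haddr;
	int type;

	untag_addr(tag_addr(0x7f0000410000ULL, MODEL_TYPE_64KB), &haddr, &type);
	printf("haddr=%#llx type=%s\n", (unsigned long long)haddr,
	       type == MODEL_TYPE_64KB ? "64KB" : "2MB");
	return 0;
}
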
+ * + * We use trylock due to lock inversion: we need to acquire + * mmap_sem while holding page lock. Fault path does it in + * reverse order. Trylock is a way to avoid deadlock. + */ + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + } + mmap_write_unlock(mm); + } else + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ if (addr & ~HPAGE_PMD_MASK) continue; if (vma->vm_end < addr + HPAGE_PMD_SIZE) @@ -1608,6 +2058,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /* Try again later */ khugepaged_add_pte_mapped_thp(mm, addr); } +#endif /* CONFIG_FINEGRAINED_THP */ } i_mmap_unlock_write(mapping); } @@ -1630,26 +2081,52 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ +#ifdef CONFIG_FINEGRAINED_THP +static void collapse_file(struct mm_struct *mm, + struct file *file, pgoff_t start, + struct page **hpage, int node, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void collapse_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage, int node) +#endif /* CONFIG_FINEGRAINED_THP */ { struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr = (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR); + int hpage_order = (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER); + pgoff_t index, end = start + hpage_nr; +#else /* CONFIG_FINEGRAINED_THP */ pgoff_t index, end = start + HPAGE_PMD_NR; +#endif /* CONFIG_FINEGRAINED_THP */ LIST_HEAD(pagelist); +#ifdef CONFIG_FINEGRAINED_THP + XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order); +#else XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); +#endif int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); +#ifdef CONFIG_FINEGRAINED_THP + VM_BUG_ON(start & (hpage_nr - 1)); +#else VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); +#endif /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; +#ifdef CONFIG_FINEGRAINED_THP + new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type); +#else new_page = khugepaged_alloc_page(hpage, gfp, node); +#endif if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out; @@ -1857,7 +2334,14 @@ out_unlock: if (is_shmem) __inc_node_page_state(new_page, NR_SHMEM_THPS); else { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + __inc_node_page_state(new_page, NR_FILE_64KB_THPS); + else + __inc_node_page_state(new_page, NR_FILE_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __inc_node_page_state(new_page, NR_FILE_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ filemap_nr_thps_inc(mapping); } @@ -1873,6 +2357,9 @@ xa_unlocked: if (result == SCAN_SUCCEED) { struct page *page, *tmp; +#ifdef CONFIG_FINEGRAINED_THP + int offset = 0; +#endif /* * Replacing old pages with new one has succeeded, now we @@ -1880,12 +2367,28 @@ xa_unlocked: */ index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type != THP_TYPE_64KB) { + while (index < page->index) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } + } + + if (hpage_type == THP_TYPE_64KB) { + copy_highpage(new_page + offset, page); + offset++; + } else + 
copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); +#else /* CONFIG_FINEGRAINED_THP */ while (index < page->index) { clear_highpage(new_page + (index % HPAGE_PMD_NR)); index++; } copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); +#endif /* CONFIG_FINEGRAINED_THP */ list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1895,13 +2398,32 @@ xa_unlocked: put_page(page); index++; } +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + while (index < end) { + clear_highpage(new_page + offset); + offset++; + index++; + } + } else { + while (index < end) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } + } +#else /* CONFIG_FINEGRAINED_THP */ while (index < end) { clear_highpage(new_page + (index % HPAGE_PMD_NR)); index++; } +#endif /* CONFIG_FINEGRAINED_THP */ SetPageUptodate(new_page); +#ifdef CONFIG_FINEGRAINED_THP + page_ref_add(new_page, hpage_nr - 1); +#else page_ref_add(new_page, HPAGE_PMD_NR - 1); +#endif if (is_shmem) set_page_dirty(new_page); lru_cache_add(new_page); @@ -1909,9 +2431,14 @@ xa_unlocked: /* * Remove pte page tables, so we can re-fault the page as huge. */ +#ifdef CONFIG_FINEGRAINED_THP + retract_page_tables(mapping, start, hpage_type); + if (hpage_type == THP_TYPE_2MB) + *hpage = NULL; +#else /* CONFIG_FINEGRAINED_THP */ retract_page_tables(mapping, start); *hpage = NULL; - +#endif /* CONFIG_FINEGRAINED_THP */ khugepaged_pages_collapsed++; } else { struct page *page; @@ -1956,14 +2483,24 @@ xa_unlocked: unlock_page(new_page); out: +#ifdef CONFIG_FINEGRAINED_THP + if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB) + put_page(new_page); +#endif VM_BUG_ON(!list_empty(&pagelist)); if (!IS_ERR_OR_NULL(*hpage)) mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } +#ifdef CONFIG_FINEGRAINED_THP +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1971,17 +2508,43 @@ static void khugepaged_scan_file(struct mm_struct *mm, int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr; + int max_ptes_swap, max_ptes_none, max_ptes_shared; + + if (hpage_type == THP_TYPE_64KB) { + hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */ + max_ptes_swap = khugepaged_max_ptes_swap_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + } else { + hpage_nr = HPAGE_PMD_NR; /* 2MB */ + max_ptes_swap = khugepaged_max_ptes_swap; + max_ptes_none = khugepaged_max_ptes_none; + max_ptes_shared = khugepaged_max_ptes_shared; + } +#endif /* CONFIG_FINEGRAINED_THP */ present = 0; swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { +#ifdef CONFIG_FINEGRAINED_THP + xas_for_each(&xas, page, start + hpage_nr - 1) +#else + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) +#endif + { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { +#ifdef CONFIG_FINEGRAINED_THP + if (++swap > max_ptes_swap) +#else + if (++swap > khugepaged_max_ptes_swap) +#endif + { result = SCAN_EXCEED_SWAP_PTE; break; } @@ -2027,19 +2590,34 @@ static void 
khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { +#ifdef CONFIG_FINEGRAINED_THP + if (present < hpage_nr - max_ptes_none) +#else + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) +#endif + { result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); +#ifdef CONFIG_FINEGRAINED_THP + collapse_file(mm, file, start, hpage, node, hpage_type); +#else collapse_file(mm, file, start, hpage, node); +#endif } } /* TODO: tracepoints */ } #else +#ifdef CONFIG_FINEGRAINED_THP +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { BUILD_BUG(); } @@ -2050,6 +2628,220 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif +#ifdef CONFIG_FINEGRAINED_THP +/* + * if return value > 0 -> vma can make hugepage + * calculated hugepage start and hugepage end are stored in pointers + * otherwise -> vma cannot make hugepage + */ +static inline int hugepage_determine_htype(unsigned long vm_start, + unsigned long vm_end, unsigned long *hstart, unsigned long *hend) { + unsigned long start, end; + + /* determine 2MB hugepage */ + start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + end = vm_end & HPAGE_PMD_MASK; + if (start >= end) { + /* determine 64KB hugepage */ + start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + end = vm_end & HPAGE_CONT_PTE_MASK; + if (start >= end) + return THP_TYPE_FAIL; + *hstart = start; + *hend = end; + return THP_TYPE_64KB; + } + *hstart = start; + *hend = end; + return THP_TYPE_2MB; +} + +enum { + KHUGEPAGE_SCAN_CONTINUE, + KHUGEPAGE_SCAN_BREAK, + KHUGEPAGE_SCAN_BREAK_MMAP_LOCK, +}; + +static unsigned int khugepaged_scan_vma(struct mm_struct *mm, + struct vm_area_struct *vma, struct page **hpage, + unsigned int pages, int *progress) +{ + unsigned long hstart, hend; + int hpage_type, ret; + int hpage_size, hpage_nr; + + if (!hugepage_vma_check(vma, vma->vm_flags)) + return KHUGEPAGE_SCAN_CONTINUE; + + hpage_type = hugepage_determine_htype( + (vma->vm_start > khugepaged_scan.address) ? 
+ vma->vm_start : khugepaged_scan.address, + vma->vm_end, &hstart, &hend); + + if (hpage_type == THP_TYPE_FAIL) + return KHUGEPAGE_SCAN_CONTINUE; + if (khugepaged_scan.address > hend) + return KHUGEPAGE_SCAN_CONTINUE; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + + if (hpage_type == THP_TYPE_64KB) { + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK); + hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */ + hpage_nr = HPAGE_CONT_PTE_NR; + } else if (hpage_type == THP_TYPE_2MB) { + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + hpage_size = HPAGE_PMD_SIZE; /* 2MB */ + hpage_nr = HPAGE_PMD_NR; + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR)) { + /* fallback, vma or file not aligned to 2MB */ + hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */ + hpage_nr = HPAGE_CONT_PTE_NR; + hpage_type = THP_TYPE_64KB; + } + } else + BUG(); + + while (khugepaged_scan.address < hend) { + if (khugepaged_scan.address + hpage_size >= hend) { + if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE < hend) { + hpage_size = HPAGE_CONT_PTE_SIZE; + hpage_nr = HPAGE_CONT_PTE_NR; + hpage_type = THP_TYPE_64KB; + } + } + ret = 0; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + return KHUGEPAGE_SCAN_BREAK; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + hpage_size > + hend); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + + mmap_read_unlock(mm); + ret = 1; + khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type); + fput(file); + } else { + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage, hpage_type); + } + /* move to next address */ + khugepaged_scan.address += hpage_size; + *progress += hpage_nr; + if (ret) + /* we released mmap_sem so break loop */ + return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK; + if (*progress >= pages) + return KHUGEPAGE_SCAN_BREAK; + } + return KHUGEPAGE_SCAN_CONTINUE; +} + +static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot, + unsigned long addr) +{ + struct thp_scan_hint *hint; + + list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) { + if (hint->slot == slot) + return hint; + } + return NULL; +} + +#ifdef CONFIG_THP_CONSERVATIVE +/* caller must hold a proper mmap_lock */ +void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr, + long diff, const char *debug) +{ + struct mm_slot *slot; + struct vm_area_struct *vma; + struct thp_scan_hint *hint; + bool wakeup = false; + bool retry = false; + + vma = find_vma(mm, addr); + if (!hugepage_vma_check(vma, vma->vm_flags)) + return; + +again: + spin_lock(&khugepaged_mm_lock); + slot = get_mm_slot(mm); + if (!slot) { + /* make a new slot or go out */ + spin_unlock(&khugepaged_mm_lock); + if (retry) + return; + if (__khugepaged_enter(mm)) + return; + retry = true; + goto again; + } + + hint = find_scan_hint(slot, addr); + if (!hint) { + spin_unlock(&khugepaged_mm_lock); + hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL); + hint->vma = vma; + hint->slot = slot; + hint->diff = 0; + hint->jiffies = jiffies; + spin_lock(&khugepaged_mm_lock); + list_add(&hint->hint_list, &khugepaged_scan.hint_list); + khugepaged_scan.nr_hint++; + } + hint->diff += diff; + if (hint->diff >= HPAGE_CONT_PTE_SIZE) { + wakeup = true; + //list_move(&hint->hint_list, &khugepaged_scan.hint_list); + } + spin_unlock(&khugepaged_mm_lock); + + /* 
if possible, wake khugepaged up for starting a scan */ + if (wakeup) { + wake_up_interruptible(&khugepaged_wait); + } +} +#else /* CONFIG_THP_CONSERVATIVE */ +void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug) +{} +#endif /* CONFIG_THP_CONSERVATIVE */ + +static void clear_hint_list(struct mm_slot *slot) +{ + struct thp_scan_hint *hint; + hint = find_scan_hint(slot, 0); + if (hint) { + list_del(&hint->hint_list); + kfree(hint); + khugepaged_scan.nr_hint--; + } +} + +static struct thp_scan_hint *get_next_hint(void) +{ + if (!list_empty(&khugepaged_scan.hint_list)) { + struct thp_scan_hint *hint = list_first_entry( + &khugepaged_scan.hint_list, + struct thp_scan_hint, hint_list); + list_del(&hint->hint_list); + khugepaged_scan.nr_hint--; + return hint; + } + return NULL; +} +#endif /* CONFIG_FINEGRAINED_THP */ + static unsigned int khugepaged_scan_mm_slot(unsigned int pages, struct page **hpage) __releases(&khugepaged_mm_lock) @@ -2063,6 +2855,38 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); +#ifdef CONFIG_FINEGRAINED_THP + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else if (!list_empty(&khugepaged_scan.hint_list)) { + struct thp_scan_hint *hint; + long mem_diff; + unsigned long jiffies_diff; + +get_next_hint: + hint = get_next_hint(); + if (!hint) + goto get_next_slot; + + mm_slot = hint->slot; + mem_diff = hint->diff; + jiffies_diff = jiffies - hint->jiffies; + kfree(hint); + clear_hint_list(mm_slot); + + if (khugepaged_test_exit(mm_slot->mm)) + goto get_next_hint; + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } else { +get_next_slot: + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + clear_hint_list(mm_slot); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } +#else /* CONFIG_FINEGRAINED_THP */ if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; else { @@ -2071,6 +2895,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); @@ -2087,13 +2912,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; for (; vma; vma = vma->vm_next) { +#ifdef CONFIG_FINEGRAINED_THP + int ret; +#else unsigned long hstart, hend; +#endif cond_resched(); if (unlikely(khugepaged_test_exit(mm))) { progress++; break; } +#ifdef CONFIG_FINEGRAINED_THP + ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress); + + if (ret == KHUGEPAGE_SCAN_CONTINUE) { + progress++; + continue; + } else if (ret == KHUGEPAGE_SCAN_BREAK) + goto breakouterloop; + else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK) + goto breakouterloop_mmap_lock; +#else /* CONFIG_FINEGRAINED_THP */ if (!hugepage_vma_check(vma, vma->vm_flags)) { skip: progress++; @@ -2143,6 +2983,7 @@ skip: if (progress >= pages) goto breakouterloop; } +#endif /* CONFIG_FINEGRAINED_THP */ } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ @@ -2160,6 +3001,53 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. 
*/ +#ifdef CONFIG_FINEGRAINED_THP + if (!list_empty(&khugepaged_scan.hint_list)) { + unsigned long jiffies_diff; + long mem_diff; + struct thp_scan_hint *hint; + struct mm_slot *next_slot; + +get_next_hint2: + hint = get_next_hint(); + + if (!hint) { + /* no more hint */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) + goto get_next_slot2; + else + goto end_loop; + } + + mem_diff = hint->diff; + jiffies_diff = jiffies - hint->jiffies; + next_slot = hint->slot; + kfree(hint); + + if (next_slot == mm_slot) + goto get_next_hint2; + + if (!khugepaged_test_exit(next_slot->mm)) { + list_move(&next_slot->mm_node, &mm_slot->mm_node); + clear_hint_list(next_slot); + } else + goto get_next_hint2; + + khugepaged_scan.mm_slot = next_slot; + khugepaged_scan.address = 0; + } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { +get_next_slot2: + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + clear_hint_list(khugepaged_scan.mm_slot); + khugepaged_scan.address = 0; + } else { +end_loop: + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } +#else /* CONFIG_FINEGRAINED_THP */ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { khugepaged_scan.mm_slot = list_entry( mm_slot->mm_node.next, @@ -2169,7 +3057,7 @@ breakouterloop_mmap_lock: khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - +#endif /* CONFIG_FINEGRAINED_THP */ collect_mm_slot(mm_slot); } @@ -2250,6 +3138,9 @@ static void khugepaged_wait_work(void) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } +#include +bool eager_allocation = false; + static int khugepaged(void *none) { struct mm_slot *mm_slot; diff --git a/mm/madvise.c b/mm/madvise.c index 24abc79..fdf4f2a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -407,6 +407,11 @@ regular_page: if (!page) continue; +#ifdef CONFIG_FINEGRAINED_THP + if (pte_cont(ptent)) + split_huge_pte_address(vma, addr, false, NULL); +#endif + /* * Creating a THP page is expensive so split it only if we * are sure it's worth. Split it if we are only owner. 
@@ -616,6 +621,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; +#ifdef CONFIG_FINEGRAINED_THP + if (pte_cont(ptent)) + split_huge_pte_address(vma, addr, false, NULL); +#endif /* CONFIG_FINEGRAINED_THP */ + /* * If pmd isn't transhuge but the page is THP and * is owned by only this process, split it and diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3c99200..429e7385 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3276,16 +3276,26 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void mem_cgroup_split_huge_fixup(struct page *head) { +#ifdef CONFIG_FINEGRAINED_THP + int page_nr = compound_nr(head); +#endif struct mem_cgroup *memcg = head->mem_cgroup; int i; if (mem_cgroup_disabled()) return; +#ifdef CONFIG_FINEGRAINED_THP + for (i = 1; i < page_nr; i++) { + css_get(&memcg->css); + head[i].mem_cgroup = memcg; + } +#else /* CONFIG_FINEGRAINED_THP */ for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); head[i].mem_cgroup = memcg; } +#endif /* CONFIG_FINEGRAINED_THP */ } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/memory.c b/mm/memory.c index 2e59295..085287f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,6 +82,8 @@ #include #include #include +#include +#include #include "pgalloc-track.h" #include "internal.h" @@ -146,6 +148,19 @@ EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; +atomic_long_t nr_phys_cont_pte_pages; +atomic_long_t nr_phys_huge_pmd_pages; + +unsigned long phys_cont_pte_pages(void) +{ + return atomic_long_read(&nr_phys_cont_pte_pages); +} + +unsigned long phys_huge_pmd_pages(void) +{ + return atomic_long_read(&nr_phys_huge_pmd_pages); +} + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -208,6 +223,11 @@ static void check_sync_rss_stat(struct task_struct *task) #endif /* SPLIT_RSS_COUNTING */ +#ifdef CONFIG_FINEGRAINED_THP +void thp_print_cont_pte_table(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned long line); +#endif /* CONFIG_FINEGRAINED_THP */ + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
@@ -730,6 +750,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + pte = arch_pte_clearhuge(pte); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(*src_pte)) @@ -763,11 +784,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); + pte = arch_pte_clearhuge(pte); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } + pte = arch_pte_clearhuge(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } @@ -860,6 +883,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, page = vm_normal_page(src_vma, addr, pte); if (page) { int retval; + /* + * when 64KB hugepage map is copied, + * clear contiguous bit + */ + pte = arch_pte_clearhuge(pte); retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, pte, page); @@ -887,7 +915,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - + pte = arch_pte_clearhuge(pte); /* * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA * does not have the VM_UFFD_WP, which means that the uffd @@ -965,6 +993,7 @@ again: progress++; continue; } + if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, @@ -974,6 +1003,7 @@ again: progress += 8; continue; } + /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, addr, rss, &prealloc); @@ -1123,6 +1153,21 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } +#ifdef CONFIG_FINEGRAINED_THP +bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, pte_t **ptep, unsigned long *addr, + unsigned long end, struct page *page, + int *rss, spinlock_t *ptl); +#else /* CONFIG_FINEGRAINED_THP */ +bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, pte_t **ptep, unsigned long *addr, + unsigned long end, struct page *page, + int *rss, spinlock_t *ptl) +{ + return false; +} +#endif /* CONFIG_FINEGRAINED_THP */ + int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { @@ -1245,6 +1290,16 @@ again: details->check_mapping != page_rmapping(page)) continue; } +#ifdef CONFIG_FINEGRAINED_THP + if (page && pte_cont(ptent) && PageTransHuge(compound_head(page))) { + if (zap_cont_pte_range(tlb, vma, pmd, &pte, + &addr, end, page, rss, ptl)) { + force_flush = 1; + break; + } + } else if (pte_cont(ptent)) + atomic_long_dec(&nr_phys_cont_pte_pages); +#endif /* CONFIG_FINEGRAINED_THP */ ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); @@ -2156,16 +2211,26 @@ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ +#ifdef CONFIG_FINEGRAINED_THP +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + return arch_remap_pte_range(mm, pmd, addr, end, pfn, prot); +} +#else /* CONFIG_FINEGRAINED_THP */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; + unsigned long next; spinlock_t *ptl; int err = 0; @@ -2179,13 +2244,50 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, err = -EACCES; break; } + + next = pte_cont_addr_end(addr, end); set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); + pte++; + addr += PAGE_SIZE; + } while (addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } +#endif /* CONFIG_FINEGRAINED_THP */ + +static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + phys_addr_t phys_addr = __pfn_to_phys(pfn); + spinlock_t *ptl; + int ret; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + /* fixme - is this correct? */ + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) { + pr_info("%s %d - freed pmd page??\n", __func__, __LINE__); + return 0; + } + + ptl = pmd_lock(mm, pmd); + ret = pmd_set_huge(pmd, phys_addr, prot); + spin_unlock(ptl); + + atomic_long_inc(&nr_phys_huge_pmd_pages); + + return ret; +} static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, @@ -2202,6 +2304,11 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); + + if (remap_try_huge_pmd(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + continue; + err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) @@ -3480,6 +3587,8 @@ out_release: return ret; } +extern bool eager_allocation; + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3538,6 +3647,22 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. 
*/ if (unlikely(anon_vma_prepare(vma))) goto oom; +#ifdef CONFIG_FINEGRAINED_THP +#ifndef CONFIG_THP_CONSERVATIVE + /* + * 64KB hugepage creation on page fault is only allowed + * in an aggressive policy or a near-conservative policy + */ + if (__transparent_hugepage_enabled(vma)) { + ret = arch_do_huge_pte_anonymous_page(vmf); + if (!(ret & VM_FAULT_FALLBACK)) { + return ret; + } + ret = 0; + } +#endif /* CONFIG_THP_CONSERVATIVE */ +#endif /* CONFIG_FINEGRAINED_THP */ + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -3786,6 +3911,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) BUILD_BUG(); return 0; } + +#ifdef CONFIG_FINEGRAINED_THP +static vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, struct page *page) +{ + BUILD_BUG(); + return 0; +} +#endif #endif /** @@ -3810,12 +3943,23 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) pte_t entry; vm_fault_t ret; - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && + compound_nr(compound_head(page)) == HPAGE_PMD_NR) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } +#ifdef CONFIG_FINEGRAINED_THP + /* PageTransHuge cannot find hugepage if the page is not a head */ + if (PageTransCompound(page) && + compound_nr(compound_head(page)) == HPAGE_CONT_PTE_NR) { + ret = arch_do_set_huge_pte(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } +#endif /* CONFIG_FINEGRAINED_THP */ + if (!vmf->pte) { ret = pte_alloc_one_map(vmf); if (ret) @@ -3827,7 +3971,11 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) update_mmu_tlb(vma, vmf->address, vmf->pte); return VM_FAULT_NOPAGE; } - + /* + if (!strcmp(current->comm, "org.tizen.nlp.s") || !strcmp(current->comm, "memps")) + pr_info("THP-wp: huge fault for addr (%lx) (%s) %s\n", + vmf->address, current->comm, __func__); + */ flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); @@ -4056,7 +4204,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); @@ -4269,10 +4416,37 @@ out: return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static inline vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + //struct timespec64 ts, te, diff; + int ret; + +#ifdef CONFIG_FINEGRAINED_THP + return VM_FAULT_FALLBACK; +#endif + + //ktime_get_ts64(&ts); + ret = do_huge_pmd_anonymous_page(vmf); + /* + ktime_get_ts64(&te); + diff = timespec64_sub(te, ts); + if (!(ret & VM_FAULT_FALLBACK)) + pr_info("THP-fault: 2MB hugepage takes %lu nsecs\n", + timespec64_to_ns(&diff)); + */ + return ret; +} +#endif /* CONFIG_FINEGRAINED_THP */ + static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) +#ifdef CONFIG_FINEGRAINED_THP + return __do_huge_pmd_anonymous_page(vmf); +#else return do_huge_pmd_anonymous_page(vmf); +#endif if (vmf->vma->vm_ops->huge_fault) return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; @@ -4299,6 +4473,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) return VM_FAULT_FALLBACK; } +#ifdef CONFIG_FINEGRAINED_THP +vm_fault_t wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte); +#endif /* CONFIG_FINEGRAINED_THP */ + static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ @@ 
-4407,8 +4585,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) goto unlock; } if (vmf->flags & FAULT_FLAG_WRITE) { - if (!pte_write(entry)) + if (!pte_write(entry)) { + int ret = arch_do_wp_page(vmf, entry); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; return do_wp_page(vmf); + } + if (arch_huge_pte_set_accessed(vmf, entry)) + goto unlock; entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); diff --git a/mm/migrate.c b/mm/migrate.c index ba56339..b16e340 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -266,6 +266,16 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, page_dup_rmap(new, true); } else #endif +#ifdef CONFIG_FINEGRAINED_THP + if (PageTransHuge(new)) { + pte = pte_mkcont(pte_mkhuge(pte)); + arch_set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, 0); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, true); + else + page_dup_rmap(new, true); + } else +#endif /* CONFIG_FINEGRAINED_THP */ { set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); diff --git a/mm/mmap.c b/mm/mmap.c index 5c8b448..02eb014 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -52,6 +52,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -271,6 +272,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; + if (newbrk > oldbrk) + khugepaged_mem_hook(mm, origbrk, newbrk - oldbrk, __func__); if (downgraded) mmap_read_unlock(mm); else @@ -1445,6 +1448,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; +#ifdef CONFIG_FINEGRAINED_THP + if ((len >> PAGE_SHIFT) >= HPAGE_CONT_PTE_NR && + file && addr == 0) + flags |= MAP_FILE_THP; +#endif + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ @@ -1867,6 +1876,12 @@ unmap_writable: allow_write_access(file); } file = vma->vm_file; + if (file && (vm_flags & VM_DENYWRITE)) + /* read-only file pages */ + khugepaged_mem_hook(mm, addr, len, __func__); + else if (!file && !vma->vm_ops) + /* anonymous pages */ + khugepaged_mem_hook(mm, addr, len, __func__); out: perf_event_mmap(vma); @@ -2190,6 +2205,19 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.high_limit = mmap_end; info.align_mask = 0; info.align_offset = 0; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (!addr && len >= HPAGE_PMD_SIZE) { + info.align_mask = HPAGE_PMD_SIZE - 1; + info.align_offset = HPAGE_PMD_SIZE; +#ifdef CONFIG_FINEGRAINED_THP + } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) { + info.align_mask = HPAGE_CONT_PTE_SIZE - 1; + info.align_offset = HPAGE_CONT_PTE_SIZE; +#endif + } +#endif + return vm_unmapped_area(&info); } #endif @@ -2232,6 +2260,19 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; info.align_offset = 0; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (!addr && len >= HPAGE_PMD_SIZE) { + info.align_mask = HPAGE_PMD_SIZE - 1; + info.align_offset = HPAGE_PMD_SIZE; +#ifdef CONFIG_FINEGRAINED_THP + } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) { + info.align_mask = HPAGE_CONT_PTE_SIZE - 1; + info.align_offset = HPAGE_CONT_PTE_SIZE; +#endif + } +#endif + addr = vm_unmapped_area(&info); /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 56c02be..956745f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -77,6 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t ptent; bool preserve_write = prot_numa && pte_write(oldpte); +#ifdef CONFIG_FINEGRAINED_THP + if (pte_cont(oldpte)) { + spin_unlock(ptl); + __split_huge_pte(vma, pmd, pte, addr, false, NULL); + spin_lock(ptl); + } +#endif /* CONFIG_FINEGRAINED_THP */ /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd. diff --git a/mm/mremap.c b/mm/mremap.c index 138abba..dc23cef 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -161,6 +161,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (pte_none(*old_pte)) continue; +#ifdef CONFIG_FINEGRAINED_THP + if (pte_cont(*old_pte)) { + /* + * Contiguous ptes will be moved, + * and we cannot ensure their alignment. + * So, simply split them. + */ + split_huge_pte_address(vma, old_addr, false, NULL); + } +#endif /* CONFIG_FINEGRAINED_THP */ + pte = ptep_get_and_clear(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure diff --git a/mm/rmap.c b/mm/rmap.c index 6657000..64de8c1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1138,7 +1138,16 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) +#ifdef CONFIG_FINEGRAINED_THP + { + if (nr == HPAGE_PMD_NR) + __inc_lruvec_page_state(page, NR_ANON_THPS); + else + __inc_lruvec_page_state(page, NR_ANON_64KB_THPS); + } +#else /* CONFIG_FINEGRAINED_THP */ __inc_lruvec_page_state(page, NR_ANON_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1179,8 +1188,14 @@ void page_add_new_anon_rmap(struct page *page, atomic_set(compound_mapcount_ptr(page), 0); if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - +#ifdef CONFIG_FINEGRAINED_THP + if (nr == HPAGE_PMD_NR) + __inc_lruvec_page_state(page, NR_ANON_THPS); + else + __inc_lruvec_page_state(page, NR_ANON_64KB_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __inc_lruvec_page_state(page, NR_ANON_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1212,9 +1227,19 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; if (PageSwapBacked(page)) +#ifdef CONFIG_FINEGRAINED_THP + __inc_node_page_state(page, nr == HPAGE_PMD_NR ? + NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED); +#else __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); +#endif else +#ifdef CONFIG_FINEGRAINED_THP + __inc_node_page_state(page, nr == HPAGE_PMD_NR ? + NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED); +#else __inc_node_page_state(page, NR_FILE_PMDMAPPED); +#endif } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1253,9 +1278,19 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; if (PageSwapBacked(page)) +#ifdef CONFIG_FINEGRAINED_THP + __dec_node_page_state(page, nr == HPAGE_PMD_NR ? + NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED); +#else __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); +#endif else +#ifdef CONFIG_FINEGRAINED_THP + __dec_node_page_state(page, nr == HPAGE_PMD_NR ? + NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED); +#else __dec_node_page_state(page, NR_FILE_PMDMAPPED); +#endif } else { if (!atomic_add_negative(-1, &page->_mapcount)) return; @@ -1286,7 +1321,14 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; +#ifdef CONFIG_FINEGRAINED_THP + if (thp_nr_pages(page) == HPAGE_PMD_NR) + __dec_lruvec_page_state(page, NR_ANON_THPS); + else + __dec_lruvec_page_state(page, NR_ANON_64KB_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __dec_lruvec_page_state(page, NR_ANON_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ if (TestClearPageDoubleMap(page)) { /* @@ -1348,8 +1390,12 @@ void page_remove_rmap(struct page *page, bool compound) */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); + if (unlikely(PageMlocked(page))) { + if (unlikely(PageTransCompound(page))) + clear_page_mlock(compound_head(page)); + else + clear_page_mlock(page); + } if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); @@ -1398,6 +1444,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, flags & TTU_SPLIT_FREEZE, page); } +#ifdef CONFIG_FINEGRAINED_THP + if (flags & TTU_SPLIT_HUGE_PTE) + split_huge_pte_address(vma, address, + flags & TTU_SPLIT_FREEZE, page); +#endif + /* * For THP, we have to assume the worse case ie pmd for invalidation. 
* For hugetlb, it could be much worse if we need to do pud @@ -1466,6 +1518,33 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); +#ifdef CONFIG_FINEGRAINED_THP + if (thp_nr_pages(page) == HPAGE_PMD_NR && + huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, range.start, range.end); + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } +#else /* CONFIG_FINEGRAINED_THP */ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { /* * huge_pmd_unshare unmapped an entire PMD @@ -1491,6 +1570,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } +#endif /* CONFIG_FINEGRAINED_THP */ } if (IS_ENABLED(CONFIG_MIGRATION) && diff --git a/mm/shmem.c b/mm/shmem.c index 537c137..01c9b74 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -884,9 +884,15 @@ static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) return true; /* Just proceed to delete a huge page wholly within the range punched */ +#ifdef CONFIG_FINEGRAINED_THP + if (PageHead(page) && + page->index >= start && page->index + thp_nr_pages(page) <= end) + return true; +#else if (PageHead(page) && page->index >= start && page->index + HPAGE_PMD_NR <= end) return true; +#endif /* CONFIG_FINEGRAINED_THP */ /* Try to split huge page, so we can truly punch the hole or truncate */ return split_huge_page(page) >= 0; @@ -1035,9 +1041,15 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, clear_highpage(page); flush_dcache_page(page); set_page_dirty(page); +#ifdef CONFIG_FINEGRAINED_THP + if (index < + round_up(start, thp_nr_pages(page))) + start = index + 1; +#else /* CONFIG_FINEGRAINED_THP */ if (index < round_up(start, HPAGE_PMD_NR)) start = index + 1; +#endif /* CONFIG_FINEGRAINED_THP */ } } unlock_page(page); @@ -1531,22 +1543,40 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; } +#ifdef CONFIG_FINEGRAINED_THP +static struct page *shmem_alloc_hugepage(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index, int page_nr) +#else /* CONFIG_FINEGRAINED_THP */ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) +#endif/* CONFIG_FINEGRAINED_THP */ { struct vm_area_struct pvma; struct address_space *mapping = info->vfs_inode.i_mapping; pgoff_t hindex; struct page *page; +#ifdef CONFIG_FINEGRAINED_THP + hindex = round_down(index, page_nr); + if (xa_find(&mapping->i_pages, &hindex, hindex + page_nr - 1, + XA_PRESENT)) + return NULL; +#else /* CONFIG_FINEGRAINED_THP */ hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, XA_PRESENT)) return NULL; +#endif /* CONFIG_FINEGRAINED_THP */ shmem_pseudo_vma_init(&pvma, info, hindex); +#ifdef CONFIG_FINEGRAINED_THP + page = 
alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, + page_nr == HPAGE_PMD_NR ? HPAGE_PMD_ORDER : HPAGE_CONT_PTE_ORDER, + &pvma, 0, numa_node_id(), true); +#else /* CONFIG_FINEGRAINED_THP */ page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); +#endif /* CONFIG_FINEGRAINED_THP */ shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1568,9 +1598,15 @@ static struct page *shmem_alloc_page(gfp_t gfp, return page; } +#ifdef CONFIG_FINEGRAINED_THP +static struct page *shmem_alloc_and_acct_page(gfp_t gfp, + struct inode *inode, + pgoff_t index, bool huge, int page_nr) +#else /* CONFIG_FINEGRAINED_THP */ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) +#endif /* CONFIG_FINEGRAINED_THP */ { struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; @@ -1579,13 +1615,21 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; +#ifdef CONFIG_FINEGRAINED_THP + nr = huge ? page_nr : 1; +#else nr = huge ? HPAGE_PMD_NR : 1; +#endif if (!shmem_inode_acct_block(inode, nr)) goto failed; if (huge) +#ifdef CONFIG_FINEGRAINED_THP + page = shmem_alloc_hugepage(gfp, info, index, nr); +#else page = shmem_alloc_hugepage(gfp, info, index); +#endif else page = shmem_alloc_page(gfp, info, index); if (page) { @@ -1805,6 +1849,9 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, int error; int once = 0; int alloced = 0; +#ifdef CONFIG_FINEGRAINED_THP + int nr_pages = HPAGE_PMD_NR; +#endif if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; @@ -1835,6 +1882,11 @@ repeat: if (page && sgp == SGP_WRITE) mark_page_accessed(page); +#ifdef CONFIG_FINEGRAINED_THP + if (page) + nr_pages = thp_nr_pages(page); +#endif + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -1870,12 +1922,21 @@ repeat: case SHMEM_HUGE_WITHIN_SIZE: { loff_t i_size; pgoff_t off; - +#ifdef CONFIG_FINEGRAINED_THP + off = round_up(index, nr_pages); +#else off = round_up(index, HPAGE_PMD_NR); +#endif i_size = round_up(i_size_read(inode), PAGE_SIZE); +#ifdef CONFIG_FINEGRAINED_THP + if (i_size >= nr_pages * PAGE_SIZE && + i_size >> PAGE_SHIFT >= off) + goto alloc_huge; +#else if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) goto alloc_huge; +#endif fallthrough; } @@ -1887,11 +1948,20 @@ repeat: } alloc_huge: +#ifdef CONFIG_FINEGRAINED_THP + page = shmem_alloc_and_acct_page(gfp, inode, index, true, nr_pages); +#else page = shmem_alloc_and_acct_page(gfp, inode, index, true); +#endif if (IS_ERR(page)) { alloc_nohuge: +#ifdef CONFIG_FINEGRAINED_THP + page = shmem_alloc_and_acct_page(gfp, inode, + index, false, 1); +#else page = shmem_alloc_and_acct_page(gfp, inode, index, false); +#endif } if (IS_ERR(page)) { int retry = 5; @@ -1917,7 +1987,11 @@ alloc_nohuge: } if (PageTransHuge(page)) +#ifdef CONFIG_FINEGRAINED_THP + hindex = round_down(index, nr_pages); +#else hindex = round_down(index, HPAGE_PMD_NR); +#endif else hindex = index; @@ -1938,6 +2012,27 @@ alloc_nohuge: spin_unlock_irq(&info->lock); alloced = true; +#ifdef CONFIG_FINEGRAINED_THP + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + nr_pages - 1) { + /* + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. 
+ */ + spin_lock(&sbinfo->shrinklist_lock); + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } +#else /* CONFIG_FINEGRAINED_THP */ if (PageTransHuge(page) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < hindex + HPAGE_PMD_NR - 1) { @@ -1957,7 +2052,7 @@ alloc_nohuge: } spin_unlock(&sbinfo->shrinklist_lock); } - +#endif /* CONFIG_FINEGRAINED_THP */ /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. */ @@ -2547,13 +2642,21 @@ shmem_write_end(struct file *file, struct address_space *mapping, struct page *head = compound_head(page); if (PageTransCompound(page)) { int i; - +#ifdef CONFIG_FINEGRAINED_THP + for (i = 0; i < thp_nr_pages(page); i++) { + if (head + i == page) + continue; + clear_highpage(head + i); + flush_dcache_page(head + i); + } +#else /* CONFIG_FINEGRAINED_THP */ for (i = 0; i < HPAGE_PMD_NR; i++) { if (head + i == page) continue; clear_highpage(head + i); flush_dcache_page(head + i); } +#endif /* CONFIG_FINEGRAINED_THP */ } if (copied < PAGE_SIZE) { unsigned from = pos & (PAGE_SIZE - 1); @@ -4102,6 +4205,12 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; +#ifdef CONFIG_FINEGRAINED_THP + off = round_up(vma->vm_pgoff, HPAGE_CONT_PTE_NR); + if (i_size >= HPAGE_CONT_PTE_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; +#endif /* CONFIG_FINEGRAINED_THP */ fallthrough; case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe..fed073f 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -312,7 +312,11 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) +#ifdef CONFIG_FINEGRAINED_THP + get_swap_pages(1, &entry, thp_nr_pages(page)); +#else get_swap_pages(1, &entry, HPAGE_PMD_NR); +#endif goto out; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 5256c10..310e06a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1673,7 +1673,12 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, } if (map) ci = lock_cluster(si, offset); - for (i = 0; i < HPAGE_PMD_NR; i++) { +#ifdef CONFIG_FINEGRAINED_THP + for (i = 0; i < thp_nr_pages(page); i++) +#else + for (i = 0; i < HPAGE_PMD_NR; i++) +#endif + { mapcount = atomic_read(&page[i]._mapcount) + 1; _total_mapcount += mapcount; if (map) { @@ -1685,7 +1690,11 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, unlock_cluster(ci); if (PageDoubleMap(page)) { map_swapcount -= 1; +#ifdef CONFIG_FINEGRAINED_THP + _total_mapcount -= thp_nr_pages(page); +#else _total_mapcount -= HPAGE_PMD_NR; +#endif } mapcount = compound_mapcount(page); map_swapcount += mapcount; diff --git a/mm/truncate.c b/mm/truncate.c index 960edf5..c981ef5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -566,8 +566,13 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unlock_page(page); continue; } else if (PageTransHuge(page)) { +#ifdef CONFIG_FINEGRAINED_THP + index += thp_nr_pages(page) - 1; + i += thp_nr_pages(page) - 1; +#else /* CONFIG_FINEGRAINED_THP */ index += HPAGE_PMD_NR - 1; i += HPAGE_PMD_NR - 1; +#endif /* CONFIG_FINEGRAINED_THP */ /* * 'end' is in the middle of THP. 
Don't * invalidate the page as the part outside of diff --git a/mm/vmscan.c b/mm/vmscan.c index 67d3833..5c23848 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1302,7 +1302,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) +#ifdef CONFIG_FINEGRAINED_THP + { + if (nr_pages == HPAGE_PMD_NR) + flags |= TTU_SPLIT_HUGE_PMD; + else + flags |= TTU_SPLIT_HUGE_PTE; + } +#else /* CONFIG_FINEGRAINED_THP */ flags |= TTU_SPLIT_HUGE_PMD; +#endif /* CONFIG_FINEGRAINED_THP */ if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; diff --git a/mm/vmstat.c b/mm/vmstat.c index 2cf6681..42f5ef2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1201,9 +1201,19 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", +#ifdef CONFIG_FINEGRAINED_THP + "nr_shmem_ptemapped", + "nr_file_64kb_hugepages", +#endif "nr_file_hugepages", "nr_file_pmdmapped", +#ifdef CONFIG_FINEGRAINED_THP + "nr_file_ptemapped", +#endif "nr_anon_transparent_hugepages", +#ifdef CONFIG_FINEGRAINED_THP + "nr_anon_64KB_transparent_hugepages", +#endif "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1323,6 +1333,9 @@ const char * const vmstat_text[] = { "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", +#ifdef CONFIG_FINEGRAINED_THP + "thp_split_cont_pte", +#endif "thp_split_pmd", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", -- 2.7.4 From a1671fa471bb8d9021718708e0a974ff80dc78b2 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Mon, 30 Aug 2021 15:15:54 +0900 Subject: [PATCH 06/16] arm64: defconfig: enable default THP configurations Enable default THP in Linux. With this configuration, the Linux kernel can make 2MB hugepages without user intervention. Change-Id: Ifdfc472a78edf76be32c359b2b15c0ca28a2bc8b Signed-off-by: Sung-hun Kim --- arch/arm64/configs/tizen_bcm2711_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index a68bf74..8365e9a 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -63,10 +63,12 @@ CONFIG_MAC_PARTITION=y CONFIG_BINFMT_MISC=m CONFIG_LKSM=y CONFIG_LKSM_FILTER=y +CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA=y CONFIG_ZSMALLOC=y +CONFIG_READ_ONLY_THP_FOR_FS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y -- 2.7.4 From 3dc6e6105cfb86df542adab579ff8ebcc10e462b Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Mon, 30 Aug 2021 15:16:52 +0900 Subject: [PATCH 07/16] arm64: defconfig: enable finegrained-THP configuration Enable finegrained-THP to allow creation of 64KB hugepages. 
Change-Id: If23d2489571ac0a5c367db54126b7b6abf42eebc Signed-off-by: Sung-hun Kim --- arch/arm64/configs/tizen_bcm2711_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/tizen_bcm2711_defconfig b/arch/arm64/configs/tizen_bcm2711_defconfig index 8365e9a..9a1e6e2 100644 --- a/arch/arm64/configs/tizen_bcm2711_defconfig +++ b/arch/arm64/configs/tizen_bcm2711_defconfig @@ -69,6 +69,7 @@ CONFIG_FRONTSWAP=y CONFIG_CMA=y CONFIG_ZSMALLOC=y CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_FINEGRAINED_THP=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y -- 2.7.4 From 6a0f555f06b0ce2e992ce6f00a51da6c94ac7d39 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 8 Sep 2021 15:58:07 +0900 Subject: [PATCH 08/16] uapi: THP: remove possible leak of CONFIG_FINEGRAINED_THP to user-space CONFIG_FINEGRAINED_THP is presented in a header file (asm-generic/mman-common.h) of uapi. It generates a build error for headers with leak CONFIG to user-space message. This patch remove a possible leak of the kernel configuration. Change-Id: I16173eaf5094cc07312f10fb33a22dd73d67ff88 Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture") Signed-off-by: Sung-hun Kim --- include/uapi/asm-generic/mman-common.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f5d33b8..2073cb1 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -30,9 +30,7 @@ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ -#ifdef CONFIG_FINEGRAINED_THP #define MAP_FILE_THP 0x200000 /* MAP_FIXED which doesn't unmap underlying mapping */ -#endif #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ -- 2.7.4 From 67144542252f40de5991a6a2bf171d2a174b4ead Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 8 Sep 2021 16:59:12 +0900 Subject: [PATCH 09/16] mm: THP: meminfo: modify areas of kernel configurations This commit modifies coverages of ifdef macros of CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_FINEGRAINED_THP to build properly by removing dependency problems. Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture") Change-Id: Id5e692eb2f89a0f93c696e9c20339940f7107874 Signed-off-by: Sung-hun Kim --- fs/proc/meminfo.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 9a782664..014f197 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -151,13 +151,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_FINEGRAINED_THP show_val_kb(m, "FileCPteMapped: ", global_node_page_state(NR_FILE_PTEMAPPED) * HPAGE_CONT_PTE_NR); -#endif /* CONFIG_FINEGRAINED_THP */ -#endif show_val_kb(m, "PhysCPteMapped: ", phys_cont_pte_pages()); +#endif /* CONFIG_FINEGRAINED_THP */ show_val_kb(m, "PhysPmdMapped: ", phys_huge_pmd_pages() * HPAGE_PMD_NR); - +#endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", -- 2.7.4 From 634e69909cae9133097ea11751eaa15c6ff6ff78 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 8 Sep 2021 17:05:14 +0900 Subject: [PATCH 10/16] mm: THP: memory: remove unnecessary function calls The called function is only used for finegrained-THP, so remove it for unconfigured cases. 
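For illustration only (not part of the hunk): once the pte_cont_addr_end() call and the now-unused `next` variable are dropped, the !CONFIG_FINEGRAINED_THP remap_pte_range() walk reduces to a plain per-PTE loop. A minimal sketch reconstructed from the hunks in this series:

	do {
		/* pfn/prot validity check elided, as in the surrounding hunk */
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
		pte++;
		addr += PAGE_SIZE;
	} while (addr != end);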
Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture") Change-Id: I138b135980acf79e695731f4d42399ca203c4ca6 Signed-off-by: Sung-hun Kim --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 085287f..08336046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2230,7 +2230,6 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; - unsigned long next; spinlock_t *ptl; int err = 0; @@ -2245,7 +2244,6 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } - next = pte_cont_addr_end(addr, end); set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; pte++; -- 2.7.4 From f17e2a879df716db843d744ecf8782d1bcaf13c6 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 8 Sep 2021 17:07:36 +0900 Subject: [PATCH 11/16] asm-generic: THP: fix bugs in asm-generic headers Since a function is defined in two headers at the same time, the kernel build is failed. And, a vm_fault_t-typed function does not return any values. This patch fixes two bugs in asm-generic headers. Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture") Change-Id: I84110bbb6c7f5b0794c55b3aca98419f12469eca Signed-off-by: Sung-hun Kim --- include/asm-generic/finegrained_thp.h | 1 + include/asm-generic/huge_mm.h | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/asm-generic/finegrained_thp.h b/include/asm-generic/finegrained_thp.h index 08a3461..35c9031 100644 --- a/include/asm-generic/finegrained_thp.h +++ b/include/asm-generic/finegrained_thp.h @@ -1,6 +1,7 @@ /* a generic header for fine-grained thp */ #ifndef __ASM_FINEGRAINED_THP_H #define __ASM_FINEGRAINED_THP_H +#ifndef CONFIG_FINEGRAINED_THP static inline void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr, long diff, const char *debug) {} diff --git a/include/asm-generic/huge_mm.h b/include/asm-generic/huge_mm.h index 48527cf..6714d0e 100644 --- a/include/asm-generic/huge_mm.h +++ b/include/asm-generic/huge_mm.h @@ -24,13 +24,11 @@ static inline pte_t arch_make_huge_pte(struct page *hpage, return mk_pte(hpage, vma->vm_page_prot); } -static inline void khugepaged_mem_hook(struct mm_struct *mm, - unsigned long addr, long diff, const char *debug) -{} - static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, struct page *page) -{} +{ + return VM_FAULT_FALLBACK; +} static inline void arch_set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff) -- 2.7.4 From 04519e317c60d1d245443b74ae0fef2af1d6241f Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 8 Sep 2021 17:18:37 +0900 Subject: [PATCH 12/16] mm: THP: workaround: only allow including specific headers for FINEGRAINED_THP configured cases asm/huge_mm.h and asm/finegrained_thp.h are only used for FINEGRAINED_THP-enabled kernel. Otherwise, such as arm which does not support contiguous PTE bit, disallow including them. 
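For reference, a minimal sketch of the include-selection pattern that this patch and the follow-up build fix converge on. The header names inside the hunks below were lost in extraction, so the paths here are assumptions taken from this commit text and the asm-generic diffstat of the previous patch:

	#ifdef CONFIG_FINEGRAINED_THP
	#include <asm/huge_mm.h>		/* assumed: arm64 cont-PTE aware helpers */
	#include <asm/finegrained_thp.h>
	#else
	#include <asm-generic/huge_mm.h>	/* assumed: stub helpers for arches without cont-PTE THP */
	#include <asm-generic/finegrained_thp.h>
	#endif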
Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture") Change-Id: I37c2bc46106711f4b7ee33a6838d87e929e13247 Signed-off-by: Sung-hun Kim --- mm/khugepaged.c | 2 ++ mm/memory.c | 5 +++++ mm/mmap.c | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index aa96e8e..34f0c40 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -21,8 +21,10 @@ #include #include +#ifdef CONFIG_FINEGRAINED_THP #include #include +#endif #include "internal.h" enum scan_result { diff --git a/mm/memory.c b/mm/memory.c index 08336046..bdf18e9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,8 +82,13 @@ #include #include #include +#ifdef CONFIG_FINEGRAINED_THP #include #include +#else +#include +#include +#endif #include "pgalloc-track.h" #include "internal.h" diff --git a/mm/mmap.c b/mm/mmap.c index 02eb014..cca7268 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -52,7 +52,11 @@ #include #include #include +#ifdef CONFIG_FINEGRAINED_THP #include +#else +#include +#endif #define CREATE_TRACE_POINTS #include -- 2.7.4 From 77427aa27cc83043be034d102525002f50bbf05f Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Mon, 13 Sep 2021 12:19:39 +0900 Subject: [PATCH 13/16] mm: THP: workaround: fix a build error that occurs if FINEGRAINED_THP is disabled Fixes: 04519e317c60 ('mm: THP: workaround: only allow including specific headers for FINEGRAINED_THP configured cases') Change-Id: Iec1678cb5c45708865a1d18fef88807e7fd47870 Signed-off-by: Sung-hun Kim --- mm/khugepaged.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 34f0c40..99cc150 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -24,6 +24,9 @@ #ifdef CONFIG_FINEGRAINED_THP #include #include +#else +#include +#include #endif #include "internal.h" -- 2.7.4 From 8690fa3fc22ac74304f26441798e540f8f929926 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 15 Sep 2021 13:28:04 +0900 Subject: [PATCH 14/16] mm, meminfo: modify page counting The two counters, nr_phys_huge_pmd_pages and nr_phys_cont_pte_pages, are kept in different units. This patch makes both counters count in base pages, not huge pages.
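To illustrate the convention being enforced, the sketch below keeps the counter in base pages so /proc/meminfo can print it without scaling; the two helper names are hypothetical, and only the HPAGE_PMD_NR-based accounting mirrors the hunks that follow.

static atomic_long_t nr_phys_huge_pmd_pages;	/* unit: base pages, not PMD mappings */

/* Hypothetical helpers: account one physically mapped huge PMD in base pages. */
static inline void phys_huge_pmd_account_map(void)
{
	atomic_long_add(HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
}

static inline void phys_huge_pmd_account_unmap(void)
{
	atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
}

With both counters in base pages, meminfo can print phys_huge_pmd_pages() directly instead of multiplying by HPAGE_PMD_NR.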
Change-Id: I1fcb6a1a9c3a60c956b861e79ec3714a33004991 Signed-off-by: Sung-hun Kim --- fs/proc/meminfo.c | 2 +- mm/huge_memory.c | 4 ++-- mm/memory.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 014f197..abc072ba 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -155,7 +155,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) phys_cont_pte_pages()); #endif /* CONFIG_FINEGRAINED_THP */ show_val_kb(m, "PhysPmdMapped: ", - phys_huge_pmd_pages() * HPAGE_PMD_NR); + phys_huge_pmd_pages()); #endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 20ea663..23d21e5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1670,7 +1670,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - atomic_long_dec(&nr_phys_huge_pmd_pages); + atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); @@ -2281,7 +2281,7 @@ repeat: pmd_t orig_pmd; orig_pmd = pmdp_huge_get_and_clear_full(vma, haddr, pmd, 0); - atomic_long_dec(&nr_phys_huge_pmd_pages); + atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages); thp_remap_pte_range_locked(mm, pmd, haddr, haddr + HPAGE_PMD_SIZE, pmd_pfn(orig_pmd), diff --git a/mm/memory.c b/mm/memory.c index bdf18e9..e6deee2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2287,7 +2287,7 @@ static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long ad ret = pmd_set_huge(pmd, phys_addr, prot); spin_unlock(ptl); - atomic_long_inc(&nr_phys_huge_pmd_pages); + atomic_long_add(HPAGE_PMD_NR, &nr_phys_huge_pmd_pages); return ret; } -- 2.7.4 From 93cdd04abf4b2522392f658de1ed35a602c0e945 Mon Sep 17 00:00:00 2001 From: Seung-Woo Kim Date: Wed, 15 Sep 2021 15:07:00 +0900 Subject: [PATCH 15/16] Partially Revert "brcmfmac: p2p: Deal with set but unused variables" This partially reverts commit 2de64ca7c9fadd32b261530592db4a6adbfcb53f. Commit 61325dc073e2 ("Revert "brcmfmac: move configuration of probe request IEs"") requires vif to be set for the p2p interface, but commit 2de64ca7c9fa removed that assignment. Partially revert the commit to keep p2p usage working through the p2p interface. Change-Id: Ia90e256c3d10396b1018e3aec8145139accfb39e Reported-by: Jiung Yu Signed-off-by: Seung-Woo Kim --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index b08d2ca..942bd53 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@ -912,6 +912,8 @@ int brcmf_p2p_scan_prep(struct wiphy *wiphy, if (err) return err; + vif = p2p->bss_idx[P2PAPI_BSSCFG_DEVICE].vif; + /* override .run_escan() callback. */ cfg->escan_info.run = brcmf_p2p_run_escan; } -- 2.7.4 From be97c7c0fb8de0bc2dfc7bf82bf02bcc11142ae0 Mon Sep 17 00:00:00 2001 From: Sung-hun Kim Date: Wed, 15 Sep 2021 16:39:13 +0900 Subject: [PATCH 16/16] mm, thp: hide remap_try_huge_pmd for the THP-disabled kernel Since remap_try_huge_pmd depends on CONFIG_TRANSPARENT_HUGEPAGE, it should be hidden when that kernel configuration is disabled.
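As a hedged alternative to guarding both the definition and the call site, a stub for the THP-disabled case would keep remap_pmd_range() free of #ifdef blocks; the patch below instead wraps both sides in CONFIG_TRANSPARENT_HUGEPAGE, which keeps the disabled configuration obvious at the call site.

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Hypothetical stub: never maps a huge PMD, so the caller falls back to PTE remapping. */
static inline int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		unsigned long pfn, pgprot_t prot)
{
	return 0;
}
#endif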
Fixes: 8690fa3fc22a ('mm, meminfo: modify page counting') Change-Id: Iae9efb2edf6cd563c794af68bea7987110a5b2da Signed-off-by: Sung-hun Kim --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e6deee2..f1e5eb9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2259,7 +2259,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return err; } #endif /* CONFIG_FINEGRAINED_THP */ - +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) @@ -2291,6 +2291,7 @@ static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long ad return ret; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, @@ -2308,10 +2309,11 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, do { next = pmd_addr_end(addr, end); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (remap_try_huge_pmd(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot)) continue; - +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) -- 2.7.4