From 04519e317c60d1d245443b74ae0fef2af1d6241f Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Wed, 8 Sep 2021 17:18:37 +0900
Subject: [PATCH 01/16] mm: THP: workaround: only allow including specific
 headers for FINEGRAINED_THP configured cases

asm/huge_mm.h and asm/finegrained_thp.h are only used by
FINEGRAINED_THP-enabled kernels. On architectures that do not support
the contiguous PTE bit, such as arm, do not include them.

Fixes: 7d5372737d34 ("mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture")
Change-Id: I37c2bc46106711f4b7ee33a6838d87e929e13247
Signed-off-by: Sung-hun Kim
---
 mm/khugepaged.c | 2 ++
 mm/memory.c     | 5 +++++
 mm/mmap.c       | 4 ++++
 3 files changed, 11 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index aa96e8e..34f0c40 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -21,8 +21,10 @@
 #include
 #include
+#ifdef CONFIG_FINEGRAINED_THP
 #include
 #include
+#endif
 #include "internal.h"

 enum scan_result {
diff --git a/mm/memory.c b/mm/memory.c
index 08336046..bdf18e9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,8 +82,13 @@
 #include
 #include
 #include
+#ifdef CONFIG_FINEGRAINED_THP
 #include
 #include
+#else
+#include
+#include
+#endif

 #include "pgalloc-track.h"
 #include "internal.h"
diff --git a/mm/mmap.c b/mm/mmap.c
index 02eb014..cca7268 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -52,7 +52,11 @@
 #include
 #include
 #include
+#ifdef CONFIG_FINEGRAINED_THP
 #include
+#else
+#include
+#endif

 #define CREATE_TRACE_POINTS
 #include
--
2.7.4

From 77427aa27cc83043be034d102525002f50bbf05f Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Mon, 13 Sep 2021 12:19:39 +0900
Subject: [PATCH 02/16] mm: THP: workaround: fix a build error that occurs
 when FINEGRAINED_THP is disabled

Fixes: 04519e317c60 ('mm: THP: workaround: only allow including specific headers for FINEGRAINED_THP configured cases')
Change-Id: Iec1678cb5c45708865a1d18fef88807e7fd47870
Signed-off-by: Sung-hun Kim
---
 mm/khugepaged.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 34f0c40..99cc150 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -24,6 +24,9 @@
 #ifdef CONFIG_FINEGRAINED_THP
 #include
 #include
+#else
+#include
+#include
 #endif

 #include "internal.h"
--
2.7.4
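Taken together, patches 01 and 02 leave mm/khugepaged.c with a guarded include
block along the lines of the sketch below. The two guarded headers are the ones
named in the patch 01 commit message; the fallback header names in the #else
branch are elided in the patch text above, so they appear here only as
hypothetical placeholders.

    #ifdef CONFIG_FINEGRAINED_THP
    /* arch-specific helpers, only present on fTHP-capable architectures */
    #include <asm/huge_mm.h>
    #include <asm/finegrained_thp.h>
    #else
    /* placeholder fallback headers for !FINEGRAINED_THP builds
     * (the real names are elided in the patch above) */
    #include <asm-generic/huge_mm.h>
    #include <asm-generic/finegrained_thp.h>
    #endif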
From 8690fa3fc22ac74304f26441798e540f8f929926 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Wed, 15 Sep 2021 13:28:04 +0900
Subject: [PATCH 03/16] mm, meminfo: modify page counting

The two counters, nr_phys_huge_pmd_pages and nr_phys_cont_pte_pages, are
maintained in different units. This patch makes both counters count base
pages rather than huge pages.

Change-Id: I1fcb6a1a9c3a60c956b861e79ec3714a33004991
Signed-off-by: Sung-hun Kim
---
 fs/proc/meminfo.c | 2 +-
 mm/huge_memory.c  | 4 ++--
 mm/memory.c       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 014f197..abc072ba 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -155,7 +155,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		    phys_cont_pte_pages());
 #endif /* CONFIG_FINEGRAINED_THP */
 	show_val_kb(m, "PhysPmdMapped: ",
-		    phys_huge_pmd_pages() * HPAGE_PMD_NR);
+		    phys_huge_pmd_pages());
 #endif
 #ifdef CONFIG_CMA
 	show_val_kb(m, "CmaTotal: ", totalcma_pages);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 20ea663..23d21e5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1670,7 +1670,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (vma_is_special_huge(vma)) {
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(tlb->mm, pmd);
-		atomic_long_dec(&nr_phys_huge_pmd_pages);
+		atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
 		spin_unlock(ptl);
 		if (is_huge_zero_pmd(orig_pmd))
 			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2281,7 +2281,7 @@ repeat:
 		pmd_t orig_pmd;

 		orig_pmd = pmdp_huge_get_and_clear_full(vma, haddr, pmd, 0);
-		atomic_long_dec(&nr_phys_huge_pmd_pages);
+		atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
 		thp_remap_pte_range_locked(mm, pmd, haddr,
 				haddr + HPAGE_PMD_SIZE,
 				pmd_pfn(orig_pmd),
diff --git a/mm/memory.c b/mm/memory.c
index bdf18e9..e6deee2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2287,7 +2287,7 @@ static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long ad
 	ret = pmd_set_huge(pmd, phys_addr, prot);
 	spin_unlock(ptl);

-	atomic_long_inc(&nr_phys_huge_pmd_pages);
+	atomic_long_add(HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);

 	return ret;
 }
--
2.7.4
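With this change both counters are maintained in base pages, so /proc/meminfo
can print them without scaling. A minimal sketch of the convention (the helper
below is illustrative only, it is not part of the patch):

    /* nr_phys_huge_pmd_pages is kept in base pages: one huge PMD mapping
     * accounts for HPAGE_PMD_NR 4K pages, so writers add or subtract that
     * amount and readers never multiply by HPAGE_PMD_NR again. */
    static atomic_long_t nr_phys_huge_pmd_pages = ATOMIC_LONG_INIT(0);

    static void account_phys_huge_pmd(bool mapped)
    {
            atomic_long_add(mapped ? HPAGE_PMD_NR : -HPAGE_PMD_NR,
                            &nr_phys_huge_pmd_pages);
    }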
From 93cdd04abf4b2522392f658de1ed35a602c0e945 Mon Sep 17 00:00:00 2001
From: Seung-Woo Kim
Date: Wed, 15 Sep 2021 15:07:00 +0900
Subject: [PATCH 04/16] Partially Revert "brcmfmac: p2p: Deal with set but
 unused variables"

This partially reverts commit 2de64ca7c9fadd32b261530592db4a6adbfcb53f.

Commit 61325dc073e2 ("Revert "brcmfmac: move configuration of probe
request IEs"") requires vif to be set to the p2p interface, but commit
2de64ca7c9fa removed that assignment. Partially revert it to restore
p2p usage with the p2p interface.

Change-Id: Ia90e256c3d10396b1018e3aec8145139accfb39e
Reported-by: Jiung Yu
Signed-off-by: Seung-Woo Kim
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
index b08d2ca..942bd53 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
@@ -912,6 +912,8 @@ int brcmf_p2p_scan_prep(struct wiphy *wiphy,
 		if (err)
 			return err;

+		vif = p2p->bss_idx[P2PAPI_BSSCFG_DEVICE].vif;
+
 		/* override .run_escan() callback. */
 		cfg->escan_info.run = brcmf_p2p_run_escan;
 	}
--
2.7.4

From be97c7c0fb8de0bc2dfc7bf82bf02bcc11142ae0 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Wed, 15 Sep 2021 16:39:13 +0900
Subject: [PATCH 05/16] mm, thp: hide remap_try_huge_pmd for the THP-disabled
 kernel

Since remap_try_huge_pmd depends on CONFIG_TRANSPARENT_HUGEPAGE, it
should be hidden when that configuration is disabled.

Fixes: 8690fa3fc22a ('mm, meminfo: modify page counting')
Change-Id: Iae9efb2edf6cd563c794af68bea7987110a5b2da
Signed-off-by: Sung-hun Kim
---
 mm/memory.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e6deee2..f1e5eb9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2259,7 +2259,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	return err;
 }
 #endif /* CONFIG_FINEGRAINED_THP */
-
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned long pfn, pgprot_t prot)
@@ -2291,6 +2291,7 @@ static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long ad
 	return ret;
 }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 			unsigned long addr, unsigned long end,
@@ -2308,10 +2309,11 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	do {
 		next = pmd_addr_end(addr, end);

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		if (remap_try_huge_pmd(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot))
 			continue;
-
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 		err = remap_pte_range(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
--
2.7.4
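The body of remap_try_huge_pmd() is not shown in the hunks above. As a rough
sketch of the kind of checks such a helper has to make before falling through
to pmd_set_huge(), under stated assumptions (the function name and exact
conditions below are illustrative, not taken from the patch):

    /* Illustrative only: a huge PMD can back the remap when the virtual
     * range covers exactly one PMD and both the virtual address and the
     * target physical address are PMD-aligned. */
    static bool remap_can_use_huge_pmd(unsigned long addr, unsigned long end,
                                       unsigned long pfn)
    {
            if ((end - addr) != PMD_SIZE)
                    return false;
            if (!IS_ALIGNED(addr, PMD_SIZE))
                    return false;
            if (!IS_ALIGNED(pfn << PAGE_SHIFT, PMD_SIZE))
                    return false;
            return true;
    }

When the check fails, the caller simply falls back to remap_pte_range(), as the
guarded call site in remap_pmd_range() above shows.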
From 78df7c9b0b1662288349db6cd2de55d76e56929a Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Thu, 16 Sep 2021 13:44:25 +0900
Subject: [PATCH 06/16] mm, thp, migrate: handling migration of 64KB hugepages

When a 64KB hugepage is migrated, it must be handled differently from
normal page mappings: the kernel has to handle a set of 16 sequential
page mappings at once. Otherwise, it can mishandle the map counts of the
compound page (that is, the set of pages backing the hugepage). This is
a source of kernel bugs, and the bug is easily reproduced on low-memory
devices.

This patch deals with the migration of 64KB hugepages.

Fixes: 7d5372737d34 ('mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture')
Change-Id: I50a5d4e9a263e7dcbded15c982f57c15a3a48f39
Signed-off-by: Sung-hun Kim
---
 arch/arm64/mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/swapops.h     | 21 +++++++++++++
 mm/migrate.c                | 17 +++++-----
 mm/rmap.c                   |  8 +++++
 4 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/huge_memory.c b/arch/arm64/mm/huge_memory.c
index 2ef1a21..1073fde 100644
--- a/arch/arm64/mm/huge_memory.c
+++ b/arch/arm64/mm/huge_memory.c
@@ -1087,4 +1087,79 @@ void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,

 	__split_huge_pte(vma, pmd, pte, haddr, freeze, page);
 }
+
+void set_huge_pte_migration_entry(
+		struct page_vma_mapped_walk *pvmw,
+		struct page *page)
+{
+	int i;
+	struct vm_area_struct *vma = pvmw->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address = pvmw->address;
+	pte_t pteval, *pte;
+	swp_entry_t entry;
+	pte_t pteswp;
+	struct page *_page = page;
+
+	if (!(pvmw->pmd && pvmw->pte))
+		return;
+
+	flush_cache_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
+	pte = pvmw->pte;
+
+	//arch_set_huge_pte_at(mm, address, pvmw->pte, ptee);
+	for (i = 0, pte = pvmw->pte; i < HPAGE_CONT_PTE_NR; i++, pte++) {
+		pteval = ptep_invalidate(vma, address, pte);
+		if (pte_dirty(pteval))
+			set_page_dirty(_page);
+		entry = make_migration_entry(page, pte_write(pteval));
+		pteswp = swp_entry_to_pte(entry);
+		if (pte_soft_dirty(pteval))
+			pteswp = pte_swp_mksoft_dirty(pteswp);
+		set_pte_at(mm, address, pte, pteswp);
+		_page++;
+		address += PAGE_SIZE;
+	}
+
+	pvmw->pte = pte;
+	pvmw->address = address;
+
+	page_remove_rmap(page, true);
+	put_page(page);
+}
+
+void remove_migration_huge_pte(
+		struct page_vma_mapped_walk *pvmw, struct page *new)
+{
+	struct vm_area_struct *vma = pvmw->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address = pvmw->address;
+	unsigned long mmun_start = address & HPAGE_CONT_PTE_MASK;
+	pte_t ptee;
+	swp_entry_t entry;
+
+	if (!(pvmw->pmd && !pvmw->pte))
+		return;
+
+	entry = pmd_to_swp_entry(*pvmw->pmd);
+	get_page(new);
+	ptee = pte_mkold(arch_make_huge_pte(new, vma));
+	if (pte_swp_soft_dirty(*pvmw->pte))
+		ptee = pte_mksoft_dirty(ptee);
+	if (is_write_migration_entry(entry))
+		ptee = maybe_mkwrite(ptee, vma);
+
+	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_CONT_PTE_SIZE);
+	if (PageAnon(new))
+		page_add_anon_rmap(new, vma, mmun_start, true);
+	else
+		page_add_file_rmap(new, true);
+
+	arch_set_huge_pte_at(mm, mmun_start, pvmw->pte, ptee, 0);
+	if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
+		mlock_vma_page(new);
+	pvmw->address = address + HPAGE_CONT_PTE_SIZE;
+	pvmw->pte = pvmw->pte + HPAGE_CONT_PTE_NR;
+	update_mmu_cache_pmd(vma, address, pvmw->pmd);
+}
 #endif /* CONFIG_FINEGRAINED_THP */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 71aa4b7..bdfbc8e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -250,6 +250,14 @@ extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 		struct page *new);

+#ifdef CONFIG_FINEGRAINED_THP
+extern void set_huge_pte_migration_entry(struct page_vma_mapped_walk *pvmw,
+		struct page *page);
+
+extern void remove_migration_huge_pte(struct page_vma_mapped_walk *pvmw,
+		struct page *new);
+#endif
+
 extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

 static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
@@ -292,6 +300,19 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
 {
 	BUILD_BUG();
 }
+#ifdef CONFIG_FINEGRAINED_THP
+static inline void set_huge_pte_migration_entry(struct page_vma_mapped_walk *pvmw,
+		struct page *page)
+{
+	BUILD_BUG();
+}
+
+static inline void remove_migration_huge_pte(struct page_vma_mapped_walk *pvmw,
+		struct page *new)
+{
+	BUILD_BUG();
+}
+#endif

 static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
diff --git a/mm/migrate.c b/mm/migrate.c
index b16e340..de299c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -230,6 +230,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 			remove_migration_pmd(&pvmw, new);
 			continue;
 		}
+#ifdef CONFIG_FINEGRAINED_THP
+		if (PageTransHuge(page) && pte_cont(*pvmw.pte)) {
+			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+			remove_migration_huge_pte(&pvmw, new);
+			continue;
+		}
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif

 		get_page(new);
@@ -266,16 +273,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 			page_dup_rmap(new, true);
 		} else
 #endif
-#ifdef CONFIG_FINEGRAINED_THP
-		if (PageTransHuge(new)) {
-			pte = pte_mkcont(pte_mkhuge(pte));
-			arch_set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, 0);
-			if (PageAnon(new))
-				page_add_anon_rmap(new, vma, pvmw.address, true);
-			else
-				page_dup_rmap(new, true);
-		} else
-#endif /* CONFIG_FINEGRAINED_THP */
 		{
 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
diff --git a/mm/rmap.c b/mm/rmap.c
index 64de8c1..0eca948 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1480,6 +1480,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			set_pmd_migration_entry(&pvmw, page);
 			continue;
 		}
+#ifdef CONFIG_FINEGRAINED_THP
+		if (pvmw.pte && pte_cont(*pvmw.pte) && (flags & TTU_MIGRATION)) {
+			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+
+			set_huge_pte_migration_entry(&pvmw, page);
+			continue;
+		}
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif

 		/*
--
2.7.4
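The core of the new arch code is the per-PTE loop in
set_huge_pte_migration_entry(): a 64KB hugepage is mapped by HPAGE_CONT_PTE_NR
(16) contiguous PTEs, and all 16 have to be converted in a single pass while
the page-table lock is held. A stripped-down sketch of that loop (locals as in
the function above; soft-dirty handling omitted):

    for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, address += PAGE_SIZE) {
            /* clear the live PTE so no CPU can keep using it */
            pte_t old = ptep_invalidate(vma, address, pte);
            swp_entry_t entry = make_migration_entry(page, pte_write(old));

            if (pte_dirty(old))
                    set_page_dirty(_page);
            /* install a migration swap entry in place of each of the 16 PTEs */
            set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
            _page++;
    }

Handling the 16 entries as one unit is what keeps the compound page's map count
consistent, which is exactly the failure mode the commit message describes.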
From a3ab8122aee25b0a8dc9d5dec53bc0ce7dcb6fc0 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski
Date: Wed, 22 Sep 2021 15:29:33 +0200
Subject: [PATCH 07/16] mm: thp: count 64k shmem pages separately

64k THPs for shmem need separate counters; otherwise, the ShmemHugePages
entry in /proc/meminfo incorrectly reports both 2M and 64k THPs as 2M
ones.

Signed-off-by: Marek Szyprowski
Change-Id: I460ea9f4e9c2f84bb066f68bfb6a291183416bb1
---
 fs/proc/meminfo.c      |  4 ++++
 include/linux/mmzone.h |  3 +++
 mm/filemap.c           | 10 +++++++++-
 mm/huge_memory.c       | 11 ++++++++---
 mm/khugepaged.c        |  7 +++++++
 mm/page_alloc.c        |  6 ++++++
 mm/shmem.c             |  5 +++++
 mm/vmstat.c            |  3 +++
 8 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index abc072ba..e619d5b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -136,6 +136,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif /* CONFIG_FINEGRAINED_THP */
 	show_val_kb(m, "ShmemHugePages: ",
 		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+	show_val_kb(m, "Shmem64KBPages: ",
+		    global_node_page_state(NR_SHMEM_64KB_THPS) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 #ifdef CONFIG_FINEGRAINED_THP
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 26df92e..5b4424e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -192,6 +192,9 @@ enum node_stat_item {
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_SHMEM_THPS,
+#ifdef CONFIG_FINEGRAINED_THP
+	NR_SHMEM_64KB_THPS,
+#endif /* CONFIG_FINEGRAINED_THP */
 	NR_SHMEM_PMDMAPPED,
 #ifdef CONFIG_FINEGRAINED_THP
 	NR_SHMEM_PTEMAPPED,
diff --git a/mm/filemap.c b/mm/filemap.c
index 02099ca..4ef7518 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -203,8 +203,16 @@ static void unaccount_page_cache_page(struct address_space *mapping,
 	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
 	if (PageSwapBacked(page)) {
 		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
-		if (PageTransHuge(page))
+		if (PageTransHuge(page)) {
+#ifdef CONFIG_FINEGRAINED_THP
+			if (thp_nr_pages(page) == HPAGE_PMD_NR)
+				__dec_node_page_state(page, NR_SHMEM_THPS);
+			else
+				__dec_node_page_state(page, NR_SHMEM_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
 			__dec_node_page_state(page, NR_SHMEM_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
+		}
 	} else if (PageTransHuge(page)) {
 #ifdef CONFIG_FINEGRAINED_THP
 		if (thp_nr_pages(page) == HPAGE_PMD_NR)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 23d21e5..e2ab0df 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2824,9 +2824,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	}
 	spin_unlock(&ds_queue->split_queue_lock);
 	if (mapping) {
-		if (PageSwapBacked(head))
-			__dec_node_page_state(head, NR_SHMEM_THPS);
-		else {
+		if (PageSwapBacked(head)) {
+#ifdef CONFIG_FINEGRAINED_THP
+			if (thp_nr_pages(head) == HPAGE_CONT_PTE_NR)
+				__dec_node_page_state(head, NR_SHMEM_64KB_THPS);
+			else
+#endif /* CONFIG_FINEGRAINED_THP */
+				__dec_node_page_state(head, NR_SHMEM_THPS);
+		} else {
 #ifdef CONFIG_FINEGRAINED_THP
 			if (thp_nr_pages(head) == HPAGE_CONT_PTE_NR)
 				__dec_node_page_state(head, NR_FILE_64KB_THPS);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 99cc150..39ee0fb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2337,7 +2337,14 @@ out_unlock:
 	}

 	if (is_shmem)
+#ifdef CONFIG_FINEGRAINED_THP
+		if (hpage_type == THP_TYPE_64KB)
+			__inc_node_page_state(new_page, NR_SHMEM_64KB_THPS);
+		else
+			__inc_node_page_state(new_page, NR_SHMEM_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
 		__inc_node_page_state(new_page, NR_SHMEM_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
 	else {
 #ifdef CONFIG_FINEGRAINED_THP
 		if (hpage_type == THP_TYPE_64KB)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 327e033..030b94c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5493,6 +5493,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			" shmem:%lukB"
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			" shmem_thp: %lukB"
+#ifdef CONFIG_FINEGRAINED_THP
+			" shmem_64kb_thp: %lukB"
+#endif /* CONFIG_FINEGRAINED_THP */
 			" shmem_pmdmapped: %lukB"
 			" anon_thp: %lukB"
 #endif
@@ -5517,6 +5520,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+#ifdef CONFIG_FINEGRAINED_THP
+			K(node_page_state(pgdat, NR_SHMEM_64KB_THPS) * HPAGE_CONT_PTE_NR),
+#endif /* CONFIG_FINEGRAINED_THP */
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR),
 			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
diff --git a/mm/shmem.c b/mm/shmem.c
index 01c9b74..2ec8fab 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -713,6 +713,11 @@ next:
 	}
 	if (PageTransHuge(page)) {
 		count_vm_event(THP_FILE_ALLOC);
+#ifdef CONFIG_FINEGRAINED_THP
+		if (thp_nr_pages(page) == HPAGE_CONT_PTE_NR)
+			__inc_node_page_state(page, NR_SHMEM_64KB_THPS);
+		else
+#endif /* CONFIG_FINEGRAINED_THP */
 		__inc_node_page_state(page, NR_SHMEM_THPS);
 	}
 	mapping->nrpages += nr;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 42f5ef2..feb65b0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1200,6 +1200,9 @@ const char * const vmstat_text[] = {
 	"nr_writeback_temp",
 	"nr_shmem",
 	"nr_shmem_hugepages",
+#ifdef CONFIG_FINEGRAINED_THP
+	"nr_shmem_64kb_hugepages",
+#endif
 	"nr_shmem_pmdmapped",
 #ifdef CONFIG_FINEGRAINED_THP
 	"nr_shmem_ptemapped",
--
2.7.4

From 1cb2541ce6d35eb3f3d0dd7e2749c11ba267c2ba Mon Sep 17 00:00:00 2001
From: Adrian Szyndela
Date: Thu, 30 Sep 2021 15:44:57 +0900
Subject: [PATCH 08/16] kdbus: don't unlink interrupted replies

When a signal interrupts a synchronous call and a reply is received
before the call is restarted, the reply is unlinked and the restarted
call can no longer pick it up.

This commit leaves replies linked if they were interrupted.

Change-Id: I89c353ecc2bae83f7c12bb199480423d743ed5bc
Signed-off-by: Adrian Szyndela
---
 ipc/kdbus/connection.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ipc/kdbus/connection.c b/ipc/kdbus/connection.c
index 02deba36..6479c04 100644
--- a/ipc/kdbus/connection.c
+++ b/ipc/kdbus/connection.c
@@ -1135,7 +1135,8 @@ static int kdbus_conn_reply(struct kdbus_conn *src,
 	if (reply) {
 		if (reply->sync)
 			wake = kdbus_reply_ref(reply);
-		kdbus_reply_unlink(reply);
+		if (!reply->interrupted)
+			kdbus_reply_unlink(reply);
 	}
 	mutex_unlock(&dst->lock);
--
2.7.4

From 5b61243f1343850f11e583e0e8dd4ffc37bef580 Mon Sep 17 00:00:00 2001
From: Adrian Szyndela
Date: Thu, 7 Oct 2021 11:27:38 +0200
Subject: [PATCH 09/16] kdbus: don't unlink _synchronous_ replies

Keeping only interrupted synchronous calls linked is not enough. If a
reply comes in just after the signal, before the reply structure is
marked as interrupted, the reply is unlinked and the restarted call can
no longer pick it up.

This commit leaves all synchronous replies linked. This way:
- non-synchronous calls work as before the changes: replies are unlinked
  by the replier in kdbus_conn_reply();
- replies for synchronous calls are unlinked by the caller in
  kdbus_conn_wait_reply().
Change-Id: If162f96a14d51d6a4475fe5c55039dc92236b19a
Signed-off-by: Adrian Szyndela
---
 ipc/kdbus/connection.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/kdbus/connection.c b/ipc/kdbus/connection.c
index 6479c04..a40be5f 100644
--- a/ipc/kdbus/connection.c
+++ b/ipc/kdbus/connection.c
@@ -1135,7 +1135,7 @@ static int kdbus_conn_reply(struct kdbus_conn *src,
 	if (reply) {
 		if (reply->sync)
 			wake = kdbus_reply_ref(reply);
-		if (!reply->interrupted)
+		else
 			kdbus_reply_unlink(reply);
 	}
 	mutex_unlock(&dst->lock);
--
2.7.4
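After both kdbus changes, responsibility for unlinking a reply is split by
reply type. A condensed view of the resulting convention (field and function
names as used in the hunks above; the caller-side behaviour is described by the
commit message rather than shown in a hunk):

    /* replier side, kdbus_conn_reply(): only asynchronous replies are
     * unlinked here */
    if (reply) {
            if (reply->sync)
                    wake = kdbus_reply_ref(reply);  /* caller still needs it */
            else
                    kdbus_reply_unlink(reply);
    }

    /* caller side, kdbus_conn_wait_reply(): synchronous replies are unlinked
     * by the waiting caller once it has picked them up, so a call restarted
     * after a signal can still find its reply. */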
From f90f5c8c5d68f6a212e6f62e329b5d7b75b6b9d5 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Thu, 30 Sep 2021 09:37:45 +0900
Subject: [PATCH 10/16] meminfo, thp: modify ifdef coverage to remove
 unexpected variable printing

If the system uses fTHP, the user can read statistics about
hugepage-mapped CMA pages via /proc/meminfo. Otherwise, the user does
not need to be aware of these values. This patch hides these exported
values when an fTHP-disabled kernel is used.

Change-Id: Iaff9dd8d81da1a3caa60959b9c5c1f44544f30d4
Signed-off-by: Sung-hun Kim
---
 fs/proc/meminfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index e619d5b..7cc0633 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -157,9 +157,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		    global_node_page_state(NR_FILE_PTEMAPPED) * HPAGE_CONT_PTE_NR);
 	show_val_kb(m, "PhysCPteMapped: ",
 		    phys_cont_pte_pages());
-#endif /* CONFIG_FINEGRAINED_THP */
 	show_val_kb(m, "PhysPmdMapped: ",
 		    phys_huge_pmd_pages());
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif
 #ifdef CONFIG_CMA
 	show_val_kb(m, "CmaTotal: ", totalcma_pages);
--
2.7.4

From 8e4778dccf771e3f2caa982731b6056ecf8b8779 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Thu, 30 Sep 2021 12:28:56 +0900
Subject: [PATCH 11/16] mm, thp: modify coverage of CONFIG_FINEGRAINED_THP
 macro

Some code should be built only for fTHP-enabled kernels. This patch
rearranges the coverage of the CONFIG_FINEGRAINED_THP macro.

Change-Id: I0541c36369f8bd7a8fe4b8868c51dc0e6879f100
Signed-off-by: Sung-hun Kim
---
 mm/huge_memory.c |  7 +++++-
 mm/memory.c      | 66 +++++++++++++++++++++++++++-----------------------------
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2ab0df..efc73a0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2194,6 +2194,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 }

+#ifdef CONFIG_FINEGRAINED_THP
 static int thp_pte_alloc_locked(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t new = pte_alloc_one(mm);
@@ -2248,6 +2249,7 @@ static inline pgprot_t thp_pmd_pgprot(pmd_t pmd)

 	return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
 }
+#endif

 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct page *page)
@@ -2275,6 +2277,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	}

 repeat:
+#ifdef CONFIG_FINEGRAINED_THP
 	if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd)) {
 		struct mm_struct *mm = vma->vm_mm;
 		unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2287,7 +2290,9 @@ repeat:
 				pmd_pfn(orig_pmd),
 				thp_pmd_pgprot(orig_pmd));
 		goto out;
-	} else if (pmd_trans_huge(*pmd) && vm_normal_page_pmd(vma, address, *pmd)) {
+	} else
+#endif /* CONFIG_FINEGRAINED_THP */
+	if (pmd_trans_huge(*pmd) && vm_normal_page_pmd(vma, address, *pmd)) {
 		if (!page) {
 			page = pmd_page(*pmd);
 			/*
diff --git a/mm/memory.c b/mm/memory.c
index f1e5eb9..eeb7825 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2229,37 +2229,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	return arch_remap_pte_range(mm, pmd, addr, end, pfn, prot);
 }
-#else /* CONFIG_FINEGRAINED_THP */
-static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
-{
-	pte_t *pte, *mapped_pte;
-	spinlock_t *ptl;
-	int err = 0;
-	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -ENOMEM;
-	arch_enter_lazy_mmu_mode();
-	do {
-		BUG_ON(!pte_none(*pte));
-		if (!pfn_modify_allowed(pfn, prot)) {
-			err = -EACCES;
-			break;
-		}
-
-		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
-		pfn++;
-		pte++;
-		addr += PAGE_SIZE;
-	} while (addr != end);
-	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(mapped_pte, ptl);
-	return err;
-}
-#endif /* CONFIG_FINEGRAINED_THP */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned long pfn, pgprot_t prot)
@@ -2291,7 +2261,36 @@ static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long ad

 	return ret;
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* CONFIG_FINEGRAINED_THP */
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	pte_t *pte, *mapped_pte;
+	spinlock_t *ptl;
+	int err = 0;
+
+	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	arch_enter_lazy_mmu_mode();
+	do {
+		BUG_ON(!pte_none(*pte));
+		if (!pfn_modify_allowed(pfn, prot)) {
+			err = -EACCES;
+			break;
+		}
+
+		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+		pfn++;
+		pte++;
+		addr += PAGE_SIZE;
+	} while (addr != end);
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(mapped_pte, ptl);
+	return err;
+}
+#endif /* CONFIG_FINEGRAINED_THP */

 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 			unsigned long addr, unsigned long end,
@@ -2308,12 +2307,11 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_FINEGRAINED_THP
 		if (remap_try_huge_pmd(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot))
 			continue;
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_FINEGRAINED_THP */
 		err = remap_pte_range(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
--
2.7.4

From 40422bc2e6cd14dc3c96f56f4b39937072607cfe Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Fri, 1 Oct 2021 13:01:01 +0900
Subject: [PATCH 12/16] mm: thp: add acquisition/release of a lock to
 guarantee consistent locking state

arm64_wp_huge_pte should reacquire the lock before returning in order
to preserve the locking semantics expected by its caller. To guarantee
this, add a lock acquisition and a lock release at the proper
positions.

Change-Id: I81fb8afc37f54bce83f353ca6b6894e70ef86934
Signed-off-by: Sung-hun Kim
---
 arch/arm64/mm/huge_memory.c | 6 +++++-
 mm/memory.c                 | 8 +++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/huge_memory.c b/arch/arm64/mm/huge_memory.c
index 1073fde..4dbb11d 100644
--- a/arch/arm64/mm/huge_memory.c
+++ b/arch/arm64/mm/huge_memory.c
@@ -470,8 +470,12 @@ vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte)
 	pte_t *hpte_p;

 	if (vma_is_anonymous(vmf->vma)) {
+		int ret;
+
 		spin_unlock(vmf->ptl);
-		return arm64_do_huge_pte_wp_page(vmf, orig_pte);
+		ret = arm64_do_huge_pte_wp_page(vmf, orig_pte);
+		spin_lock(vmf->ptl);
+		return ret;
 	}

 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
diff --git a/mm/memory.c b/mm/memory.c
index eeb7825..a82193a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4591,8 +4591,14 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (!pte_write(entry)) {
 		int ret = arch_do_wp_page(vmf, entry);

-		if (!(ret & VM_FAULT_FALLBACK))
+		if (!(ret & VM_FAULT_FALLBACK)) {
+			/*
+			 * arch_do_wp_page returns
+			 * VM_FAULT value with spin lock acquisition.
+			 */
+			spin_unlock(vmf->ptl);
 			return ret;
+		}
 		return do_wp_page(vmf);
 	}
 	if (arch_huge_pte_set_accessed(vmf, entry))
--
2.7.4
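Taken together, the two hunks keep the page-table lock balanced across the arch
hook. A condensed view of the pairing (not a verbatim excerpt):

    /* arch side (arm64_wp_huge_pte): drop ptl for the copy-on-write work,
     * retake it before returning so the function exits with the lock held,
     * which is what the caller expects. */
    spin_unlock(vmf->ptl);
    ret = arm64_do_huge_pte_wp_page(vmf, orig_pte);
    spin_lock(vmf->ptl);
    return ret;

    /* generic side (handle_pte_fault): every non-fallback return from
     * arch_do_wp_page() now owns ptl, so it is released exactly once here. */
    ret = arch_do_wp_page(vmf, entry);
    if (!(ret & VM_FAULT_FALLBACK)) {
            spin_unlock(vmf->ptl);
            return ret;
    }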
From 60c6791df949e6ef790a63552e283a747f9d636d Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Fri, 1 Oct 2021 13:06:07 +0900
Subject: [PATCH 13/16] mm, thp: do not set PTE_SPECIAL for huge zero page

Previously, a huge zero page was remapped to normal PTE mappings with
the PTE_SPECIAL flag when a hugepage split was requested. This causes a
bug when the kernel later tries to look up the page with
vm_normal_page. This patch resolves the problem by adding a condition
to the if-statement.

Change-Id: I62946d3c3e92be309ccbe987f24a33503a7e23dc
Signed-off-by: Sung-hun Kim
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index efc73a0..18bdc8c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2278,7 +2278,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,

 repeat:
 #ifdef CONFIG_FINEGRAINED_THP
-	if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd)) {
+	if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd) && !is_huge_zero_pmd(*pmd)) {
 		struct mm_struct *mm = vma->vm_mm;
 		unsigned long haddr = address & HPAGE_PMD_MASK;
 		pmd_t orig_pmd;
--
2.7.4

From 44021aaaef7f4e80d3ce5886bac8c8ac378aee93 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Fri, 1 Oct 2021 14:15:51 +0900
Subject: [PATCH 14/16] mm: thp: khugepaged: flush tlb range to prevent
 concurrent memory accesses

When khugepaged creates a hugepage while a user process is accessing
memory, khugepaged must prevent concurrent accesses to the scanned area
because they can cause a race condition. Before creating a hugepage,
khugepaged takes the mm_struct lock and then flushes the TLBs for the
scanned range. For 64KB hugepages, however, it does not flush the TLBs
before installing the new hugepage into the page table. In this case
the user process can lose updates, because it still has the old mapping
in its TLB entries. By flushing the TLBs right after acquiring
mmap_lock, the process stalls before reusing its memory contents.

Change-Id: I408662d69fa68e6210be8ad0b585943bfb8894e8
Signed-off-by: Sung-hun Kim
---
 mm/khugepaged.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 39ee0fb..0622868 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1384,7 +1384,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		 * __collapse_huge_page_isolate and __collapse_huge_page_copy
 		 * to fail, __collapse_huge_page_copy also clears ptes
 		 */
-		;
+		flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
 	else
 #endif /* CONFIG_FINEGRAINED_THP */
 		_pmd = pmdp_collapse_flush(vma, address, pmd);
--
2.7.4
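In other words, the 64KB path now mirrors what pmdp_collapse_flush() already
does for 2M collapses: the stale user translations are shot down while
mmap_lock is held for write, before the contiguous-PTE hugepage is installed.
A condensed sketch of the ordering (not a verbatim excerpt; the locking call is
taken from khugepaged's existing collapse path):

    mmap_write_lock(mm);        /* the collapse runs with mmap_lock held for write */

    /* 64KB (contiguous PTE) case: shoot down stale user translations for the
     * range before the pages are isolated, copied and replaced by the new
     * 64KB mapping; a racing access now faults instead of hitting an old
     * TLB entry. */
    flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);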
From 91192eaf1633c6b6a6cada09007b4e788fdb3550 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Wed, 6 Oct 2021 13:33:00 +0900
Subject: [PATCH 15/16] thp, khugepaged: skip retracting page table if a 64KB
 hugepage mapping is already established

When khugepaged tries to retract page table entries, a user process may
be handling a page fault at the same time. The mmap_lock is used to
prevent concurrent modifications of the address space; if khugepaged
fails to take the lock, it delays the page table retraction to the next
iteration. Previously, however, khugepaged assumed that the page table
entries held normal mappings, which is not always true for 64KB
hugepages. This patch checks whether a 64KB hugepage mapping is already
established; if it is, khugepaged simply skips the retraction.

Change-Id: I961e21e65e2ae09df43488582d1469e250059909
Signed-off-by: Sung-hun Kim
---
 mm/khugepaged.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0622868..692b2fc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1850,6 +1850,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 		goto drop_hpage;

 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+	if (pte_cont(*start_pte)) {
+		pte_unmap_unlock(start_pte, ptl);
+		goto drop_hpage;
+	}
+#endif

 	/* step 1: check all mapped PTEs are to the right huge page */
 	for (i = 0, addr = haddr, pte = start_pte;
--
2.7.4

From bca41ebfc598858807071491cdb85f61773d8a77 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Thu, 7 Oct 2021 16:57:39 +0900
Subject: [PATCH 16/16] mm, thp: preventing hugepage creation for read-write
 file pages

Sometimes a user process writes to file pages whose VMA has the
VM_DENYWRITE flag set in vma->vm_flags (and, of course, VM_WRITE as
well). In this case the kernel creates a new page through a COW fault,
which is unexpected behaviour for 64KB file hugepages. This patch
disallows scanning of read-write file pages to prevent this buggy
behaviour.

Change-Id: I28c1da7f7ad4be55be5607316b29a2978896fcb9
Signed-off-by: Sung-hun Kim
---
 arch/arm64/mm/finegrained_thp.c | 2 +-
 mm/khugepaged.c                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/finegrained_thp.c b/arch/arm64/mm/finegrained_thp.c
index 5ebb4ac..570747c 100644
--- a/arch/arm64/mm/finegrained_thp.c
+++ b/arch/arm64/mm/finegrained_thp.c
@@ -17,7 +17,7 @@ bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma,
 {
 	/* Read-only file mappings need to be aligned for THP to work. */
 	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
-	    (vm_flags & VM_DENYWRITE)) {
+	    (vm_flags & VM_DENYWRITE) && !(vm_flags & VM_WRITE)) {
 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
 				nr_pages);
 	}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 692b2fc..21105b0 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -513,7 +513,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
 		return true;
 	/* Read-only file mappings need to be aligned for THP to work. */
 	else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
-		 (vm_flags & VM_DENYWRITE)) {
+		 (vm_flags & VM_DENYWRITE) && !(vm_flags & VM_WRITE)) {
 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
 				HPAGE_PMD_NR);
 	}
--
2.7.4
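The stricter eligibility test, shown in isolation: a file-backed VMA is treated
as a read-only THP candidate only when writes are denied to other openers
(VM_DENYWRITE) and are also impossible through this mapping itself (!VM_WRITE),
so a writable mapping can no longer reach the COW path with a 64KB file
hugepage behind it. This simply restates the condition from the two hunks
above:

    if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
        (vm_flags & VM_DENYWRITE) && !(vm_flags & VM_WRITE))
            /* only properly aligned, genuinely read-only file VMAs qualify */
            return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                              HPAGE_PMD_NR);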