From f384796c40dc55b3dba25e0ee9c1afd98c6d24d1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 26 Mar 2018 15:34:48 +0530 Subject: [PATCH] powerpc/mm: Add support for handling > 512TB address in SLB miss For addresses above 512TB we allocate additional mmu contexts. To make it all easy, addresses above 512TB are handled with IR/DR=1 and with stack frame setup. The mmu_context_t is also updated to track the new extended_ids. To support upto 4PB we need a total 8 contexts. Signed-off-by: Aneesh Kumar K.V [mpe: Minor formatting tweaks and comment wording, switch BUG to WARN in get_ea_context().] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 6 ++ arch/powerpc/include/asm/book3s/64/hash-64k.h | 6 ++ arch/powerpc/include/asm/book3s/64/mmu.h | 33 +++++++- arch/powerpc/include/asm/mmu_context.h | 39 ++++++++++ arch/powerpc/include/asm/processor.h | 6 ++ arch/powerpc/kernel/exceptions-64s.S | 11 ++- arch/powerpc/kernel/traps.c | 12 --- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 4 +- arch/powerpc/mm/mmu_context_book3s64.c | 15 +++- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/slb.c | 108 ++++++++++++++++++++++++++ arch/powerpc/mm/slb_low.S | 11 ++- arch/powerpc/mm/slice.c | 15 +++- arch/powerpc/mm/tlb_hash64.c | 2 +- 15 files changed, 245 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 4cbec81..4b54230 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -11,6 +11,12 @@ #define H_PUD_INDEX_SIZE 9 #define H_PGD_INDEX_SIZE 9 +/* + * Each context is 512TB. But on 4k we restrict our max TASK size to 64TB + * Hence also limit max EA bits to 64TB. + */ +#define MAX_EA_BITS_PER_CONTEXT 46 + #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index fcca6de..f7a4f5c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -8,6 +8,12 @@ #define H_PGD_INDEX_SIZE 8 /* + * Each context is 512TB size. SLB miss for first context/default context + * is handled in the hotpath. + */ +#define MAX_EA_BITS_PER_CONTEXT 49 + +/* * 64k aligned address free up few of the lower bits of RPN for us * We steal that here. For more deatils look at pte_pfn/pfn_pte() */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c8c836e..5094696 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -91,7 +91,18 @@ struct slice_mask { }; typedef struct { - mm_context_id_t id; + union { + /* + * We use id as the PIDR content for radix. On hash we can use + * more than one id. The extended ids are used when we start + * having address above 512TB. We allocate one extended id + * for each 512TB. The new id is then used with the 49 bit + * EA to build a new VA. We always use ESID_BITS_1T_MASK bits + * from EA and new context ids to build the new VAs. + */ + mm_context_id_t id; + mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE]; + }; u16 user_psize; /* page size index */ /* Number of bits in the mm_cpumask */ @@ -196,5 +207,25 @@ extern void radix_init_pseries(void); static inline void radix_init_pseries(void) { }; #endif +static inline int get_ea_context(mm_context_t *ctx, unsigned long ea) +{ + int index = ea >> MAX_EA_BITS_PER_CONTEXT; + + if (likely(index < ARRAY_SIZE(ctx->extended_id))) + return ctx->extended_id[index]; + + /* should never happen */ + WARN_ON(1); + return 0; +} + +static inline unsigned long get_user_vsid(mm_context_t *ctx, + unsigned long ea, int ssize) +{ + unsigned long context = get_ea_context(ctx, ea); + + return get_vsid(context, ea, ssize); +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 3a15b6db..1835ca1 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -60,12 +60,51 @@ extern int hash__alloc_context_id(void); extern void hash__reserve_context_id(int id); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } + +static inline int alloc_extended_context(struct mm_struct *mm, + unsigned long ea) +{ + int context_id; + + int index = ea >> MAX_EA_BITS_PER_CONTEXT; + + context_id = hash__alloc_context_id(); + if (context_id < 0) + return context_id; + + VM_WARN_ON(mm->context.extended_id[index]); + mm->context.extended_id[index] = context_id; + return context_id; +} + +static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) +{ + int context_id; + + context_id = get_ea_context(&mm->context, ea); + if (!context_id) + return true; + return false; +} + #else extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); extern unsigned long __init_new_context(void); extern void __destroy_context(unsigned long context_id); extern void mmu_context_init(void); +static inline int alloc_extended_context(struct mm_struct *mm, + unsigned long ea) +{ + /* non book3s_64 should never find this called */ + WARN_ON(1); + return -ENOMEM; +} + +static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) +{ + return false; +} #endif #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 01299cdc..75b0844 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -119,9 +119,15 @@ void release_thread(struct task_struct *); */ #define TASK_SIZE_USER64 TASK_SIZE_512TB #define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB +#define TASK_CONTEXT_SIZE TASK_SIZE_512TB #else #define TASK_SIZE_USER64 TASK_SIZE_64TB #define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB +/* + * We don't need to allocate extended context ids for 4K page size, because + * we limit the max effective address on this config to 64TB. + */ +#define TASK_CONTEXT_SIZE TASK_SIZE_64TB #endif /* diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6bee20c..1a0aa70 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -621,7 +621,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq- 8f /* if bad address, make full stack frame */ + /* + * Large address, check whether we have to allocate new contexts. + */ + beq- 8f bne- cr5,2f /* if unrecoverable exception, oops */ @@ -685,7 +688,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mr r3,r12 mfspr r11,SPRN_SRR0 mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10,bad_addr_slb) + LOAD_HANDLER(r10, large_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 @@ -700,7 +703,7 @@ EXC_COMMON_BEGIN(unrecov_slb) bl unrecoverable_exception b 1b -EXC_COMMON_BEGIN(bad_addr_slb) +EXC_COMMON_BEGIN(large_addr_slb) EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) RECONCILE_IRQ_STATE(r10, r11) ld r3, PACA_EXSLB+EX_DAR(r13) @@ -710,7 +713,7 @@ EXC_COMMON_BEGIN(bad_addr_slb) std r10, _TRAP(r1) 2: bl save_nvgprs addi r3, r1, STACK_FRAME_OVERHEAD - bl slb_miss_bad_addr + bl slb_miss_large_addr b ret_from_except EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1e48d15..f200bfd 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1495,18 +1495,6 @@ bail: exception_exit(prev_state); } -void slb_miss_bad_addr(struct pt_regs *regs) -{ - enum ctx_state prev_state = exception_enter(); - - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); - else - bad_page_fault(regs, regs->dar, SIGSEGV); - - exception_exit(prev_state); -} - void StackOverflow(struct pt_regs *regs) { printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n", diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 697b70a..7d0945b 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -112,7 +112,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) return 1; psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); vsidkey = SLB_VSID_USER; break; case VMALLOC_REGION_ID: diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 17fc13c..4180b89 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1267,7 +1267,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); @@ -1532,7 +1532,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Get VSID */ ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; /* diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 422be81..b75194d 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -179,6 +179,19 @@ void __destroy_context(int context_id) } EXPORT_SYMBOL_GPL(__destroy_context); +static void destroy_contexts(mm_context_t *ctx) +{ + int index, context_id; + + spin_lock(&mmu_context_lock); + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_remove(&mmu_context_ida, context_id); + } + spin_unlock(&mmu_context_lock); +} + #ifdef CONFIG_PPC_64K_PAGES static void destroy_pagetable_page(struct mm_struct *mm) { @@ -217,7 +230,7 @@ void destroy_context(struct mm_struct *mm) else subpage_prot_free(mm); destroy_pagetable_page(mm); - __destroy_context(mm->context.id); + destroy_contexts(&mm->context); mm->context.id = MMU_NO_CONTEXT; } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 469808e..a87b18c 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -320,7 +320,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); + vsid = get_user_vsid(&mm->context, addr, ssize); WARN_ON(vsid == 0); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 13cfe41..66577cc 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -340,3 +341,110 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } + +static void insert_slb_entry(unsigned long vsid, unsigned long ea, + int bpsize, int ssize) +{ + unsigned long flags, vsid_data, esid_data; + enum slb_index index; + int slb_cache_index; + + /* + * We are irq disabled, hence should be safe to access PACA. + */ + index = get_paca()->stab_rr; + + /* + * simple round-robin replacement of slb starting at SLB_NUM_BOLTED. + */ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + + get_paca()->stab_rr = index; + + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + vsid_data = (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); + esid_data = mk_esid_data(ea, ssize, index); + + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data) + : "memory"); + + /* + * Now update slb cache entries + */ + slb_cache_index = get_paca()->slb_cache_ptr; + if (slb_cache_index < SLB_CACHE_ENTRIES) { + /* + * We have space in slb cache for optimized switch_slb(). + * Top 36 bits from esid_data as per ISA + */ + get_paca()->slb_cache[slb_cache_index++] = esid_data >> 28; + get_paca()->slb_cache_ptr++; + } else { + /* + * Our cache is full and the current cache content strictly + * doesn't indicate the active SLB conents. Bump the ptr + * so that switch_slb() will ignore the cache. + */ + get_paca()->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; + } +} + +static void handle_multi_context_slb_miss(int context_id, unsigned long ea) +{ + struct mm_struct *mm = current->mm; + unsigned long vsid; + int bpsize; + + /* + * We are always above 1TB, hence use high user segment size. + */ + vsid = get_vsid(context_id, ea, mmu_highuser_ssize); + bpsize = get_slice_psize(mm, ea); + insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize); +} + +void slb_miss_large_addr(struct pt_regs *regs) +{ + enum ctx_state prev_state = exception_enter(); + unsigned long ea = regs->dar; + int context; + + if (REGION_ID(ea) != USER_REGION_ID) + goto slb_bad_addr; + + /* + * Are we beyound what the page table layout supports ? + */ + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) + goto slb_bad_addr; + + /* Lower address should have been handled by asm code */ + if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT)) + goto slb_bad_addr; + + /* + * consider this as bad access if we take a SLB miss + * on an address above addr limit. + */ + if (ea >= current->mm->context.slb_addr_limit) + goto slb_bad_addr; + + context = get_ea_context(¤t->mm->context, ea); + if (!context) + goto slb_bad_addr; + + handle_multi_context_slb_miss(context, ea); + exception_exit(prev_state); + return; + +slb_bad_addr: + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + exception_exit(prev_state); +} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 2c7c717..a83fbd2 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -75,10 +75,15 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) */ _GLOBAL(slb_allocate) /* - * check for bad kernel/user address - * (ea & ~REGION_MASK) >= PGTABLE_RANGE + * Check if the address falls within the range of the first context, or + * if we may need to handle multi context. For the first context we + * allocate the slb entry via the fast path below. For large address we + * branch out to C-code and see if additional contexts have been + * allocated. + * The test here is: + * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT) */ - rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) + rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4) bne- 8f srdi r9,r3,60 /* get region */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 09ac1a7..9cd87d1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -648,6 +648,15 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, slice_print_mask(" mask", &potential_mask); convert: + /* + * Try to allocate the context before we do slice convert + * so that we handle the context allocation failure gracefully. + */ + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); if (compat_maskp && !fixed) slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); @@ -658,10 +667,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, if (psize > MMU_PAGE_BASE) on_each_cpu(slice_flush_segments, mm, 1); } + return newaddr; return_addr: + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } return newaddr; - } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 9b23f12..87d71dd 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -89,7 +89,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); + vsid = get_user_vsid(&mm->context, addr, ssize); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; -- 2.7.4