mm: optimize thread stack usage on arm64 [1/1]
author     tao zeng <tao.zeng@amlogic.com>
Mon, 15 Oct 2018 07:20:38 +0000 (15:20 +0800)
committer  Luan Yuan <luan.yuan@amlogic.com>
Mon, 17 Dec 2018 06:52:05 +0000 (14:52 +0800)
PD#SWPL-1219

Problem:
On arm64, the thread stack is 16KB per task. When many tasks are
running, this memory can exceed 40MB (e.g. 2,560 tasks * 16KB = 40MB),
which is a lot on platforms with little memory. Yet in most cases a
thread uses less than 4KB of stack, so much of this memory is wasted
and worth optimizing.

Solution:
1. Pre-allocate a vmalloc address space for task stacks;
2. Map only the first (top) page of each stack and handle the page
   fault in EL1 when stack growth triggers an exception;
3. Handle the stack switch needed to take that exception.
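
For reference, the allocation path boils down to the sketch below. It is a
simplified version of aml_stack_alloc() added in vmap_stack.c by this patch:
locking, accounting and error paths are trimmed, and the function name
stack_alloc_sketch is illustrative only.

	void *stack_alloc_sketch(void)
	{
		unsigned long slot, addr;
		struct page *page;

		/* pick a free THREAD_SIZE slot in the reserved vmalloc area */
		slot = find_next_zero_bit(avmap->bitmap, MAX_TASKS,
					  avmap->start_bit);
		if (slot >= MAX_TASKS)
			return NULL;
		bitmap_set(avmap->bitmap, slot, 1);

		/*
		 * back only the top page now; lower pages are mapped on
		 * demand by handle_vmap_fault() from the el1_da entry path
		 */
		page = alloc_page(THREADINFO_GFP | __GFP_ZERO);
		if (!page)
			return NULL;
		addr = (unsigned long)avmap->root_vm->addr + THREAD_SIZE * slot;
		vmap_mmu_set(page, addr + STACK_TOP_PAGE_OFF, 1);

		return (void *)addr;
	}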

Verify:
p212

Change-Id: I47f511ccfa2868d982bc10a820ed6435b6d52ba9
Signed-off-by: tao zeng <tao.zeng@amlogic.com>
16 files changed:
MAINTAINERS
arch/arm64/kernel/entry.S
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kernel/smp.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kernel/traps.c
drivers/amlogic/memory_ext/Kconfig
drivers/amlogic/memory_ext/Makefile
drivers/amlogic/memory_ext/vmap_stack.c [new file with mode: 0644]
drivers/amlogic/pm/gx_pm.c
fs/proc/meminfo.c
include/linux/amlogic/vmap_stack.h [new file with mode: 0644]
include/linux/sched.h
include/linux/vmalloc.h
kernel/fork.c
mm/vmalloc.c

index 0792422..f3cd225 100644 (file)
@@ -13520,6 +13520,8 @@ AMLOGIC driver for memory extend
 M: Tao Zeng <tao.zeng@amlogic.com>
 F: drivers/amlogic/memory_ext/*
 F: include/linux/amlogic/ramdump.h
+F: include/linux/amlogic/vmap_stack.h
+F: drivers/amlogic/memory_ext/vmap_stack.c
 
 AMLOGIC driver for memory extend
 M: Tao Zeng <tao.zeng@amlogic.com>
index fead713..8c07515 100644 (file)
@@ -189,7 +189,11 @@ alternative_else_nop_endif
        */
        .endm
 
+#ifdef CONFIG_AMLOGIC_VMAP
+       .macro  kernel_exit, el, swap = 0
+#else
        .macro  kernel_exit, el
+#endif /* CONFIG_AMLOGIC_VMAP */
        .if     \el != 0
        /* Restore the task's original addr_limit. */
        ldr     x20, [sp, #S_ORIG_ADDR_LIMIT]
@@ -271,6 +275,18 @@ alternative_else_nop_endif
        ldp     x26, x27, [sp, #16 * 13]
        ldp     x28, x29, [sp, #16 * 14]
        ldr     lr, [sp, #S_LR]
+#ifdef CONFIG_AMLOGIC_VMAP
+       /* restore context sp and per-cpu vmap stack */
+       .if     \swap == 1
+       stp     x19, x20, [sp]
+       mov     x20, sp
+       add     x19, x20, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x19
+       mrs     x19, DBGWVR2_EL1
+       mov     sp,  x19
+       ldp     x19, x20, [x20]
+       .endif
+#endif /* CONFIG_AMLOGIC_VMAP */
        add     sp, sp, #S_FRAME_SIZE           // restore sp
 
        .if     \el == 0
@@ -313,8 +329,10 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
         * Add a dummy stack frame, this non-standard format is fixed up
         * by unwind_frame()
         */
+#ifndef CONFIG_AMLOGIC_VMAP /* we need to get the right stack for el1_preempt */
        stp     x29, x19, [sp, #-16]!
        mov     x29, sp
+#endif /* !CONFIG_AMLOGIC_VMAP */
 
 9998:
        .endm
@@ -447,6 +465,26 @@ ENDPROC(el1_error_invalid)
  */
        .align  6
 el1_sync:
+#ifdef CONFIG_AMLOGIC_VMAP
+       /*
+        * Register usage:
+        * DBGWVR2_EL1: temp register and backup of sp_el1 for the exception
+        * DBGWVR3_EL1: always points to the per-cpu vmap stack
+        * Switch sp_el1 to the per-cpu vmap stack and use DBGWVR2_EL1
+        * to back up sp_el1 while in the exception
+        */
+       msr     DBGWVR2_EL1, x29
+       mrs     x29, DBGWVR3_EL1
+       sub     x29, x29, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x29
+       stp     x19, x20, [x29]
+       mov     x19, sp
+       mrs     x20, DBGWVR2_EL1
+       msr     DBGWVR2_EL1, x19
+       mov     sp,  x29
+       mov     x29, x20
+       ldp     x19, x20, [sp]
+#endif /* CONFIG_AMLOGIC_VMAP */
        kernel_entry 1
        mrs     x1, esr_el1                     // read the syndrome register
        lsr     x24, x1, #ESR_ELx_EC_SHIFT      // exception class
@@ -474,6 +512,38 @@ el1_da:
        /*
         * Data abort handling
         */
+#ifdef CONFIG_AMLOGIC_VMAP
+       /*
+        * First handle a possible vmap page fault; if the result is not OK
+        * (e.g. the fault address is not in the vmap range), do a normal data abort
+        */
+       mrs     x0, far_el1
+       mov     x2, sp
+       stp     x29, x30, [sp, #-16]!   /* add a stack frame for backtrace */
+       mov     x29, sp
+       bl      handle_vmap_fault
+       ldp     x29, x22, [sp], #16
+       cmp     x0, #0
+       b.ne    888888f
+       kernel_exit 1, 1                /* exit for vmap fault */
+888888:
+       /*
+        * Not a vmap fault: copy the context saved on the per-cpu vmap
+        * stack to the task stack, then switch back to the task stack
+        * for the rest of the exception
+        */
+       mrs     x0, DBGWVR2_EL1
+       mov     x1, sp
+       mov     x2, #S_FRAME_SIZE
+       bl      memcpy
+       add     x17, x0, #S_FRAME_SIZE
+       str     x17, [x0, #S_SP]
+       mrs     x1, esr_el1     /* rebuild parameter for normal handler */
+       mrs     x18, DBGWVR3_EL1
+       add     x18, x18, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x18
+       mov     sp, x0
+#endif /* CONFIG_AMLOGIC_VMAP */
        mrs     x3, far_el1
        enable_dbg
        // re-enable interrupts if they were enabled in the aborted context
@@ -503,6 +573,20 @@ el1_undef:
        mov     x0, sp
        b       do_undefinstr
 el1_dbg:
+#ifdef CONFIG_AMLOGIC_VMAP
+       /* switch back to task stack pointer */
+       mrs     x0, DBGWVR2_EL1
+       mov     x1, sp
+       mov     x2, #S_FRAME_SIZE
+       bl      memcpy
+       add     x17, x0, #S_FRAME_SIZE
+       str     x17, [x0, #S_SP]
+       mrs     x1, esr_el1     /* rebuild parameter for normal handler */
+       mrs     x18, DBGWVR3_EL1
+       add     x18, x18, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x18
+       mov     sp, x0
+#endif /* CONFIG_AMLOGIC_VMAP */
        /*
         * Debug exception handling
         */
@@ -527,7 +611,36 @@ ENDPROC(el1_sync)
 
        .align  6
 el1_irq:
+#ifdef CONFIG_AMLOGIC_VMAP
+       /* switch stack to avoid losing ELR if el1_da
+        * happens while saving context
+        */
+       msr     DBGWVR2_EL1, x29
+       mrs     x29, DBGWVR3_EL1
+       sub     x29, x29, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x29
+       stp     x19, x20, [x29]
+       mov     x19, sp
+       mrs     x20, DBGWVR2_EL1
+       msr     DBGWVR2_EL1, x19
+       mov     sp,  x29
+       mov     x29, x20
+       ldp     x19, x20, [sp]
+#endif /* CONFIG_AMLOGIC_VMAP */
        kernel_entry 1
+#ifdef CONFIG_AMLOGIC_VMAP
+       /* switch back to task stack pointer */
+       mrs     x0, DBGWVR2_EL1
+       mov     x1, sp
+       mov     x2, #S_FRAME_SIZE
+       mov     sp, x0
+       bl      memcpy
+       add     x17, sp, #S_FRAME_SIZE
+       str     x17, [sp, #S_SP]
+       mrs     x18, DBGWVR3_EL1
+       add     x18, x18, #S_FRAME_SIZE
+       msr     DBGWVR3_EL1, x18
+#endif /* CONFIG_AMLOGIC_VMAP */
        enable_dbg
 #ifdef CONFIG_TRACE_IRQFLAGS
        bl      trace_hardirqs_off
@@ -1035,3 +1148,20 @@ ENTRY(sys_rt_sigreturn_wrapper)
        mov     x0, sp
        b       sys_rt_sigreturn
 ENDPROC(sys_rt_sigreturn_wrapper)
+
+#ifdef CONFIG_AMLOGIC_VMAP
+ENTRY(__setup_vmap_stack)
+       ldr             x18, =vmap_stack
+       add             x18, x18, x0
+       mov             x0, x18
+       mov             x1, #0
+       mov             x2, #THREAD_SIZE
+       mov             x17, lr
+       bl              memset                  /* clear stack buffer */
+       mov             lr, x17
+       mov             x0, #THREAD_START_SP
+       add             x18, x18, x0            /* set stack top */
+       msr             DBGWVR3_EL1, x18
+       ret
+ENDPROC(__setup_vmap_stack)
+#endif /* CONFIG_AMLOGIC_VMAP */
index fb0082a..0798abd 100644 (file)
@@ -133,6 +133,11 @@ NOKPROBE_SYMBOL(read_wb_reg);
 
 static void write_wb_reg(int reg, int n, u64 val)
 {
+#ifdef CONFIG_AMLOGIC_VMAP
+       /* avoid writing DBGWVR since we use it for a special purpose */
+       if (reg >= AARCH64_DBG_REG_WVR && reg < AARCH64_DBG_REG_WCR)
+               return;
+#endif
        switch (reg + n) {
        GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_BVR, AARCH64_DBG_REG_NAME_BVR, val);
        GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_BCR, AARCH64_DBG_REG_NAME_BCR, val);
index 4097031..0185f89 100644 (file)
 #include <asm/perf_event.h>
 #endif
 
+#ifdef CONFIG_AMLOGIC_VMAP
+#include <linux/amlogic/vmap_stack.h>
+#endif
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/ipi.h>
 
@@ -226,6 +230,9 @@ asmlinkage void secondary_start_kernel(void)
 
        cpu = task_cpu(current);
        set_my_cpu_offset(per_cpu_offset(cpu));
+#ifdef CONFIG_AMLOGIC_VMAP
+       __setup_vmap_stack(my_cpu_offset);
+#endif
 
        /*
         * All kernel threads share the same mm context; grab a
@@ -446,6 +453,9 @@ void __init smp_cpus_done(unsigned int max_cpus)
 void __init smp_prepare_boot_cpu(void)
 {
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+#ifdef CONFIG_AMLOGIC_VMAP
+       __setup_vmap_stack(my_cpu_offset);
+#endif
        /*
         * Initialise the static keys early as they may be enabled by the
         * cpufeature code.
index 5201beb..28dee26 100644 (file)
 #include <asm/stack_pointer.h>
 #include <asm/stacktrace.h>
 
+#ifdef CONFIG_AMLOGIC_VMAP
+#include <linux/amlogic/vmap_stack.h>
+#endif
+
 /*
  * AArch64 PCS assigns the frame pointer to x29.
  *
@@ -117,6 +121,15 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
                        return -EINVAL;
                }
        }
+#ifdef CONFIG_AMLOGIC_VMAP
+       /*
+        * keep searching the task's stack
+        */
+       if (on_vmap_stack(frame->sp, raw_smp_processor_id()) &&
+           !on_vmap_stack(frame->fp, raw_smp_processor_id())) {
+               frame->sp = frame->fp;
+       }
+#endif
 
        return 0;
 }
index b2b036b..6e2d130 100644 (file)
@@ -97,6 +97,21 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
        set_fs(fs);
 }
 
+#ifdef CONFIG_AMLOGIC_VMAP
+static void dump_backtrace_entry(unsigned long ip, unsigned long fp)
+{
+       unsigned long fp_size = 0;
+
+       if (fp >= VMALLOC_START) {
+               fp_size = *((unsigned long *)fp) - fp;
+               /* fp crosses IRQ or vmap stack */
+               if (fp_size >= THREAD_SIZE)
+                       fp_size = 0;
+       }
+       printk("[%016lx+%4ld][<%p>] %pS\n",
+               fp, fp_size, (void *) ip, (void *) ip);
+}
+#else
 static void dump_backtrace_entry(unsigned long where)
 {
        /*
@@ -104,6 +119,7 @@ static void dump_backtrace_entry(unsigned long where)
         */
        print_ip_sym(where);
 }
+#endif
 
 static void __dump_instr(const char *lvl, struct pt_regs *regs)
 {
@@ -186,7 +202,11 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 
                /* skip until specified stack frame */
                if (!skip) {
+               #ifdef CONFIG_AMLOGIC_VMAP
+                       dump_backtrace_entry(where, frame.fp);
+               #else
                        dump_backtrace_entry(where);
+               #endif
                } else if (frame.fp == regs->regs[29]) {
                        skip = 0;
                        /*
@@ -196,7 +216,11 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
                         * at which an exception has taken place, use regs->pc
                         * instead.
                         */
+               #ifdef CONFIG_AMLOGIC_VMAP
+                       dump_backtrace_entry(regs->pc, frame.fp);
+               #else
                        dump_backtrace_entry(regs->pc);
+               #endif
                }
                ret = unwind_frame(tsk, &frame);
                if (ret < 0)
index cef724a..5b6f2fc 100644 (file)
@@ -39,6 +39,17 @@ config AMLOGIC_CMA
                Amlogic CMA optimization for cma alloc/free problems
                Including policy change of CMA usage
 
+config AMLOGIC_VMAP
+       bool "Amlogic kernel stack"
+       depends on AMLOGIC_MEMORY_EXTEND
+       depends on 64BIT
+       default y
+       help
+               This config enables Amlogic kernel stack usage
+               optimization with vmalloc. It depends on
+               AMLOGIC_MEMORY_EXTEND and is only available on
+               64-bit platforms.
+
 config AMLOGIC_SLUB_DEBUG
        bool "Amlogic debug for trace all slub objects"
        depends on AMLOGIC_PAGE_TRACE
index f3c1216..8d3c266 100644 (file)
@@ -3,3 +3,4 @@ obj-$(CONFIG_AMLOGIC_PAGE_TRACE)  += page_trace.o
 obj-$(CONFIG_AMLOGIC_CMA)         += aml_cma.o
 obj-$(CONFIG_AMLOGIC_SLUB_DEBUG)  += aml_slub_debug.o
 obj-$(CONFIG_AMLOGIC_RAMDUMP)     += ram_dump.o
+obj-$(CONFIG_AMLOGIC_VMAP)        += vmap_stack.o
diff --git a/drivers/amlogic/memory_ext/vmap_stack.c b/drivers/amlogic/memory_ext/vmap_stack.c
new file mode 100644 (file)
index 0000000..687a1a6
--- /dev/null
@@ -0,0 +1,536 @@
+/*
+ * drivers/amlogic/memory_ext/vmap_stack.c
+ *
+ * Copyright (C) 2017 Amlogic, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/memblock.h>
+#include <linux/vmalloc.h>
+#include <linux/arm-smccc.h>
+#include <linux/memcontrol.h>
+#include <linux/amlogic/vmap_stack.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+#include <asm/stacktrace.h>
+
+#define DEBUG                                                  0
+
+#define D(format, args...)                                     \
+       { if (DEBUG)                                            \
+               pr_info("VMAP:%s "format, __func__, ##args);    \
+       }
+
+#define E(format, args...)     pr_err("VMAP:%s "format, __func__, ##args)
+
+static unsigned long stack_shrink_jiffies;
+static unsigned char vmap_shrink_enable;
+static atomic_t vmap_stack_size;
+static struct aml_vmap *avmap;
+
+DEFINE_PER_CPU(unsigned long [THREAD_SIZE/sizeof(long)], vmap_stack)
+       __aligned(16);
+
+void update_vmap_stack(int diff)
+{
+       atomic_add(diff, &vmap_stack_size);
+}
+EXPORT_SYMBOL(update_vmap_stack);
+
+int get_vmap_stack_size(void)
+{
+       return atomic_read(&vmap_stack_size);
+}
+EXPORT_SYMBOL(get_vmap_stack_size);
+
+static int is_vmap_addr(unsigned long addr)
+{
+       unsigned long start, end;
+
+       start = (unsigned long)avmap->root_vm->addr;
+       end   = (unsigned long)avmap->root_vm->addr + avmap->root_vm->size;
+       if ((addr >= start) && (addr < end))
+               return 1;
+       else
+               return 0;
+}
+
+static struct page *get_vmap_cached_page(int *remain)
+{
+       unsigned long flags;
+       struct page *page;
+
+       spin_lock_irqsave(&avmap->page_lock, flags);
+       if (unlikely(!avmap->cached_pages)) {
+               spin_unlock_irqrestore(&avmap->page_lock, flags);
+               return NULL;
+       }
+       page = list_first_entry(&avmap->list, struct page, lru);
+       list_del(&page->lru);
+       avmap->cached_pages--;
+       *remain = avmap->cached_pages;
+       spin_unlock_irqrestore(&avmap->page_lock, flags);
+
+       return page;
+}
+
+static int vmap_mmu_set(struct page *page, unsigned long addr, int set)
+{
+       pgd_t *pgd = NULL;
+       pud_t *pud = NULL;
+       pmd_t *pmd = NULL;
+       pte_t *pte = NULL;
+
+       pgd = pgd_offset_k(addr);
+       pud = pud_alloc(&init_mm, pgd, addr);
+       if (!pud)
+               goto nomem;
+
+       if (pud_none(*pud)) {
+               pmd = pmd_alloc(&init_mm, pud, addr);
+               if (!pmd)
+                       goto nomem;
+       }
+
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd)) {
+               pte = pte_alloc_kernel(pmd, addr);
+               if (!pte)
+                       goto nomem;
+       }
+
+       pte = pte_offset_map(pmd, addr);
+       if (set)
+               set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+       else
+               pte_clear(&init_mm, addr, pte);
+       pte_unmap(pte);
+       flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+       D("add:%lx, pgd:%p %llx, pmd:%p %llx, pte:%p %llx\n",
+               addr, pgd, pgd_val(*pgd), pmd, pmd_val(*pmd),
+               pte, pte_val(*pte));
+       return 0;
+nomem:
+       E("allocation page talbe failed, G:%p, U:%p, M:%p, T:%p",
+               pgd, pud, pmd, pte);
+       return -ENOMEM;
+}
+
+static int stack_floor_page(unsigned long addr)
+{
+       /*
+        * stack addresses must be aligned to THREAD_SIZE
+        */
+       return ((addr & (THREAD_SIZE - 1)) < PAGE_SIZE);
+}
+
+static int check_addr_up_flow(unsigned long addr)
+{
+       /*
+        * It's the lowest page of the 4 contiguous pages of a
+        * THREAD_SIZE-aligned stack slot, but the next page above
+        * this addr is not mapped
+        */
+       if (stack_floor_page(addr) &&
+           !vmalloc_to_page((const void *)(addr + PAGE_SIZE)))
+               return 1;
+       return 0;
+}
+
+#if DEBUG
+static void dump_backtrace_entry(unsigned long ip, unsigned long fp)
+{
+       unsigned long fp_size = 0;
+
+       if (fp >= VMALLOC_START) {
+               fp_size = *((unsigned long *)fp) - fp;
+               /* fp crosses IRQ or vmap stack */
+               if (fp_size >= THREAD_SIZE)
+                       fp_size = 0;
+       }
+       pr_info("[%016lx+%4ld][<%p>] %pS\n",
+               fp, fp_size, (void *) ip, (void *) ip);
+}
+
+static void show_fault_stack(unsigned long addr, struct pt_regs *regs)
+{
+       struct stackframe frame;
+
+       frame.fp = regs->regs[29];
+       frame.sp = addr;
+       frame.pc = (unsigned long)regs->regs[30];
+
+       pr_info("Call trace:\n");
+       pr_info("[%016lx+%4ld][<%p>] %pS\n",
+               addr, frame.fp - addr, (void *)regs->pc, (void *) regs->pc);
+       while (1) {
+               int ret;
+
+               dump_backtrace_entry(frame.pc, frame.fp);
+               ret = unwind_frame(current, &frame);
+               if (ret < 0)
+                       break;
+       }
+}
+#endif
+
+/*
+ * IRQs should *NEVER* be enabled in this handler
+ */
+int handle_vmap_fault(unsigned long addr, unsigned int esr,
+                     struct pt_regs *regs)
+{
+       struct page *page;
+       int cache = 0;
+
+       if (!is_vmap_addr(addr))
+               return -EINVAL;
+
+       D("addr:%lx, esr:%x, task:%5d %s\n",
+               addr, esr, current->pid, current->comm);
+       D("pc:%pf, %llx, lr:%pf, %llx, sp:%llx, %lx\n",
+               (void *)regs->pc, regs->pc,
+               (void *)regs->regs[30], regs->regs[30], regs->sp,
+               current_stack_pointer);
+
+       if (check_addr_up_flow(addr)) {
+               E("address %lx out of range\n", addr);
+               E("PC is:%llx, %pf, LR is:%llx %pf\n",
+                       regs->pc, (void *)regs->pc,
+                       regs->regs[30], (void *)regs->regs[30]);
+               E("task:%d %s, stack:%p, %lx\n",
+                       current->pid, current->comm, current->stack,
+                       current_stack_pointer);
+               dump_stack();
+               return -ERANGE;
+       }
+
+       /*
+        * allocate a new page for vmap
+        */
+       page = get_vmap_cached_page(&cache);
+       WARN_ON(!page);
+       vmap_mmu_set(page, addr, 1);
+       update_vmap_stack(1);
+       if ((THREAD_SIZE_ORDER  > 1) && stack_floor_page(addr)) {
+               E("task:%d %s, stack near overflow, addr:%lx\n",
+                       current->pid, current->comm, addr);
+               dump_stack();
+       }
+
+       /* cached pages are running low, trigger a refill */
+       if (cache <= (VMAP_CACHE_PAGE / 2))
+               mod_delayed_work(system_highpri_wq, &avmap->mwork, 0);
+
+       D("map page:%5lx for addr:%lx\n", page_to_pfn(page), addr);
+#if DEBUG
+       show_fault_stack(addr, regs);
+#endif
+
+       return 0;
+}
+EXPORT_SYMBOL(handle_vmap_fault);
+
+static unsigned long vmap_shrink_count(struct shrinker *s,
+                                 struct shrink_control *sc)
+{
+       return global_page_state(NR_KERNEL_STACK_KB);
+}
+
+static int shrink_vm_stack(unsigned long low, unsigned long high)
+{
+       int pages = 0;
+       struct page *page;
+
+       for (; low < (high & PAGE_MASK); low += PAGE_SIZE) {
+               page = vmalloc_to_page((const void *)low);
+               vmap_mmu_set(page, low, 0);
+               update_vmap_stack(-1);
+               __free_page(page);
+               pages++;
+       }
+       return pages;
+}
+
+static unsigned long get_task_stack_floor(unsigned long sp)
+{
+       unsigned long end;
+
+       end = sp & (THREAD_SIZE - 1);
+       while (sp > end) {
+               if (!vmalloc_to_page((const void *)sp))
+                       break;
+               sp -= PAGE_SIZE;
+       }
+       return PAGE_ALIGN(sp);
+}
+
+static unsigned long vmap_shrink_scan(struct shrinker *s,
+                                     struct shrink_control *sc)
+{
+       struct task_struct *tsk;
+       unsigned long thread_sp;
+       unsigned long stack_floor;
+       unsigned long rem = 0;
+
+       if (!vmap_shrink_enable)
+               return 0;
+
+       /*
+        * back off for a while if shrinking happens too often
+        */
+       if (jiffies - stack_shrink_jiffies <= STACK_SHRINK_SLEEP)
+               return 0;
+
+       rcu_read_lock();
+       for_each_process(tsk) {
+               thread_sp = thread_saved_sp(tsk);
+               stack_floor = get_task_stack_floor(thread_sp);
+               /*
+        * Make sure the selected task is sleeping
+                */
+               D("r:%3ld, sp:[%lx-%lx], s:%5ld, tsk:%lx %d %s\n",
+                       rem, thread_sp, stack_floor,
+                       thread_sp - stack_floor,
+                       tsk->state, tsk->pid, tsk->comm);
+               task_lock(tsk);
+               if (tsk->state == TASK_RUNNING) {
+                       task_unlock(tsk);
+                       continue;
+               }
+               if (thread_sp - stack_floor >= STACK_SHRINK_THRESHOLD)
+                       rem += shrink_vm_stack(stack_floor, thread_sp);
+               task_unlock(tsk);
+       }
+       rcu_read_unlock();
+       stack_shrink_jiffies = jiffies;
+
+       return rem;
+}
+
+static struct shrinker vmap_shrinker = {
+       .scan_objects = vmap_shrink_scan,
+       .count_objects = vmap_shrink_count,
+       .seeks = DEFAULT_SEEKS * 16
+};
+
+/* FOR debug */
+static unsigned long vmap_debug_jiff;
+
+void aml_account_task_stack(struct task_struct *tsk, int account)
+{
+       unsigned long stack = (unsigned long)task_stack_page(tsk);
+       struct page *first_page;
+
+       stack += STACK_TOP_PAGE_OFF;
+       first_page = vmalloc_to_page((void *)stack);
+       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                           THREAD_SIZE / 1024 * account);
+
+       memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                   account * (THREAD_SIZE / 1024));
+       if (time_after(jiffies, vmap_debug_jiff + HZ * 5)) {
+               vmap_debug_jiff = jiffies;
+               D("KERNEL_STACK:%ld KB, vmap stack:%d KB, cached:%d KB\n",
+                       global_page_state(NR_KERNEL_STACK_KB),
+                       get_vmap_stack_size() << (PAGE_SHIFT - 10),
+                       avmap->cached_pages << (PAGE_SHIFT - 10));
+       }
+}
+
+void *aml_stack_alloc(int node, struct task_struct *tsk)
+{
+       unsigned long bitmap_no, raw_start;
+       struct page *page;
+       unsigned long addr, map_addr, flags;
+
+       spin_lock_irqsave(&avmap->vmap_lock, flags);
+       raw_start = avmap->start_bit;
+       bitmap_no = find_next_zero_bit(avmap->bitmap, MAX_TASKS,
+                                      avmap->start_bit);
+       avmap->start_bit = bitmap_no + 1; /* next idle address space */
+       if (bitmap_no >= MAX_TASKS) {
+               spin_unlock_irqrestore(&avmap->vmap_lock, flags);
+               E("BITMAP FULL!!!\n");
+               return NULL;
+       }
+       bitmap_set(avmap->bitmap, bitmap_no, 1);
+       spin_unlock_irqrestore(&avmap->vmap_lock, flags);
+
+       page = alloc_page(THREADINFO_GFP | __GFP_ZERO);
+       if (!page) {
+               spin_lock_irqsave(&avmap->vmap_lock, flags);
+               bitmap_clear(avmap->bitmap, bitmap_no, 1);
+               spin_unlock_irqrestore(&avmap->vmap_lock, flags);
+               E("alloction page failed\n");
+               return NULL;
+       }
+       /*
+        * map first page only
+        */
+       addr = (unsigned long)avmap->root_vm->addr + THREAD_SIZE * bitmap_no;
+       map_addr = addr + STACK_TOP_PAGE_OFF;
+       vmap_mmu_set(page, map_addr, 1);
+       update_vmap_stack(1);
+       D("bit idx:%5ld, start:%5ld, addr:%lx, page:%lx\n",
+               bitmap_no, raw_start, addr, page_to_pfn(page));
+
+       return (void *)addr;
+}
+
+void aml_stack_free(struct task_struct *tsk)
+{
+       unsigned long stack = (unsigned long)tsk->stack;
+       unsigned long addr, bitmap_no;
+       struct page *page;
+       unsigned long flags;
+
+       addr = stack + STACK_TOP_PAGE_OFF;
+       for (; addr >= stack; addr -= PAGE_SIZE) {
+               page = vmalloc_to_page((const void *)addr);
+               if (!page)
+                       break;
+               vmap_mmu_set(page, addr, 0);
+               /* replenish the stack page cache first */
+               spin_lock_irqsave(&avmap->page_lock, flags);
+               if (avmap->cached_pages < VMAP_CACHE_PAGE) {
+                       list_add_tail(&page->lru, &avmap->list);
+                       avmap->cached_pages++;
+                       spin_unlock_irqrestore(&avmap->page_lock, flags);
+                       clear_highpage(page);   /* clear for next use */
+               } else {
+                       spin_unlock_irqrestore(&avmap->page_lock, flags);
+                       __free_page(page);
+               }
+               update_vmap_stack(-1);
+       }
+       bitmap_no = (stack - (unsigned long)avmap->root_vm->addr) / THREAD_SIZE;
+       spin_lock_irqsave(&avmap->vmap_lock, flags);
+       bitmap_clear(avmap->bitmap, bitmap_no, 1);
+       if (bitmap_no < avmap->start_bit)
+               avmap->start_bit = bitmap_no;
+       spin_unlock_irqrestore(&avmap->vmap_lock, flags);
+}
+
+static void page_cache_maintain_work(struct work_struct *work)
+{
+       struct page *page;
+       struct list_head head;
+       int i, cnt;
+       unsigned long flags;
+
+       spin_lock_irqsave(&avmap->page_lock, flags);
+       cnt = avmap->cached_pages;
+       spin_unlock_irqrestore(&avmap->page_lock, flags);
+       if (cnt >= VMAP_CACHE_PAGE) {
+               D("cache full cnt:%d\n", cnt);
+               schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY);
+               return;
+       }
+
+       INIT_LIST_HEAD(&head);
+       for (i = 0; i < VMAP_CACHE_PAGE - cnt; i++) {
+               page = alloc_page(GFP_KERNEL | __GFP_HIGH);
+               if (!page) {
+                       E("get page failed, allocated:%d, cnt:%d\n", i, cnt);
+                       break;
+               }
+               list_add(&page->lru, &head);
+       }
+       spin_lock_irqsave(&avmap->page_lock, flags);
+       list_splice(&head, &avmap->list);
+       avmap->cached_pages += i;
+       spin_unlock_irqrestore(&avmap->page_lock, flags);
+       D("add %d pages, cnt:%d\n", i, cnt);
+       schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY);
+}
+
+int __init start_thread_work(void)
+{
+       schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY);
+       return 0;
+}
+arch_initcall(start_thread_work);
+
+void __init thread_stack_cache_init(void)
+{
+       int i;
+       unsigned long addr;
+       struct page *page;
+
+       page = alloc_pages(GFP_KERNEL, VMAP_CACHE_PAGE_ORDER);
+       if (!page)
+               return;
+
+       avmap = kzalloc(sizeof(struct aml_vmap), GFP_KERNEL);
+       if (!avmap) {
+               __free_pages(page, VMAP_CACHE_PAGE_ORDER);
+               return;
+       }
+
+       avmap->bitmap = kzalloc(MAX_TASKS / 8, GFP_KERNEL);
+       if (!avmap->bitmap) {
+               __free_pages(page, VMAP_CACHE_PAGE_ORDER);
+               kfree(avmap);
+               return;
+       }
+       pr_info("%s, vmap:%p, bitmap:%p, cache page:%lx\n",
+               __func__, avmap, avmap->bitmap, page_to_pfn(page));
+       avmap->root_vm = __get_vm_area_node(VM_STACK_AREA_SIZE,
+                                           VM_STACK_AREA_SIZE,
+                                           0, VMALLOC_START, VMALLOC_END,
+                                           NUMA_NO_NODE, GFP_KERNEL,
+                                           __builtin_return_address(0));
+       if (!avmap->root_vm) {
+               __free_pages(page, VMAP_CACHE_PAGE_ORDER);
+               kfree(avmap->bitmap);
+               kfree(avmap);
+               return;
+       }
+       pr_info("%s, allocation vm area:%p, addr:%p, size:%lx\n", __func__,
+               avmap->root_vm, avmap->root_vm->addr,
+               avmap->root_vm->size);
+
+       INIT_LIST_HEAD(&avmap->list);
+       spin_lock_init(&avmap->page_lock);
+       spin_lock_init(&avmap->vmap_lock);
+
+       for (i = 0; i < VMAP_CACHE_PAGE; i++) {
+               list_add(&page->lru, &avmap->list);
+               page++;
+       }
+       avmap->cached_pages = VMAP_CACHE_PAGE;
+       INIT_DELAYED_WORK(&avmap->mwork, page_cache_maintain_work);
+
+       for_each_possible_cpu(i) {
+               addr = (unsigned long)per_cpu_ptr(vmap_stack, i);
+               pr_info("cpu %d, vmap_stack:[%lx-%lx]\n",
+                       i, addr, addr + THREAD_START_SP);
+               addr = (unsigned long)per_cpu_ptr(irq_stack, i);
+               pr_info("cpu %d, irq_stack: [%lx-%lx]\n",
+                       i, addr, addr + THREAD_START_SP);
+       }
+       register_shrinker(&vmap_shrinker);
+}
index 8228001..dbf9151 100644 (file)
@@ -42,6 +42,9 @@
 #include <linux/kobject.h>
 #include <../kernel/power/power.h>
 #include <linux/amlogic/scpi_protocol.h>
+#ifdef CONFIG_AMLOGIC_VMAP
+#include <linux/amlogic/vmap_stack.h>
+#endif
 
 typedef unsigned long (psci_fn)(unsigned long, unsigned long,
                                unsigned long, unsigned long);
@@ -82,6 +85,9 @@ static void meson_gx_suspend(void)
 /*     cpu_suspend(0, meson_system_suspend);
  */
        pr_info("... wake up\n");
+#ifdef CONFIG_AMLOGIC_VMAP
+       __setup_vmap_stack(my_cpu_offset);
+#endif
 }
 
 static int meson_pm_prepare(void)
index 8a42849..58bd174 100644 (file)
@@ -15,6 +15,9 @@
 #ifdef CONFIG_CMA
 #include <linux/cma.h>
 #endif
+#ifdef CONFIG_AMLOGIC_VMAP
+#include <linux/amlogic/vmap_stack.h>
+#endif
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include "internal.h"
@@ -153,6 +156,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
        show_val_kb(m, "CmaFree:        ",
                    global_page_state(NR_FREE_CMA_PAGES));
 #endif
+#ifdef CONFIG_AMLOGIC_VMAP
+       show_val_kb(m, "VmapStack:      ", get_vmap_stack_size());
+#endif
 
        hugetlb_report_meminfo(m);
 
diff --git a/include/linux/amlogic/vmap_stack.h b/include/linux/amlogic/vmap_stack.h
new file mode 100644 (file)
index 0000000..8f7a36f
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * include/linux/amlogic/vmap_stack.h
+ *
+ * Copyright (C) 2017 Amlogic, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __VMAP_STACK_H__
+#define __VMAP_STACK_H__
+
+#define STACK_SHRINK_THRESHOLD         (PAGE_SIZE + 1024)
+#define STACK_SHRINK_SLEEP             (HZ)
+#define VM_STACK_AREA_SIZE             SZ_512M
+
+#define STACK_TOP_PAGE_OFF             (THREAD_SIZE - PAGE_SIZE)
+
+#define MAX_TASKS                      (VM_STACK_AREA_SIZE / THREAD_SIZE)
+
+#define VMAP_PAGE_FLAG                 (__GFP_ZERO | __GFP_HIGH |\
+                                        __GFP_ATOMIC | __GFP_REPEAT)
+
+#define VMAP_CACHE_PAGE_ORDER          6
+#define VMAP_CACHE_PAGE                        (1 << VMAP_CACHE_PAGE_ORDER)
+#define CACHE_MAINTAIN_DELAY           (HZ)
+
+struct aml_vmap {
+       unsigned int start_bit;
+       int cached_pages;
+       struct vm_struct *root_vm;
+       unsigned long *bitmap;
+       struct list_head list;
+       spinlock_t vmap_lock;
+       spinlock_t page_lock;
+       struct delayed_work mwork;
+};
+
+extern int handle_vmap_fault(unsigned long addr,
+                            unsigned int esr, struct pt_regs *regs);
+
+extern DEFINE_PER_CPU(unsigned long [THREAD_SIZE/sizeof(long)], vmap_stack);
+static inline bool on_vmap_stack(unsigned long sp, int cpu)
+{
+       /* variable names are the same as in kernel/stacktrace.c */
+       unsigned long low = (unsigned long)per_cpu(vmap_stack, cpu);
+       unsigned long high = low + THREAD_START_SP;
+
+       return (low <= sp && sp <= high);
+}
+
+extern void  __setup_vmap_stack(unsigned long off);
+extern void  update_vmap_stack(int diff);
+extern int   get_vmap_stack_size(void);
+extern void  aml_stack_free(struct task_struct *tsk);
+extern void *aml_stack_alloc(int node, struct task_struct *tsk);
+extern void  aml_account_task_stack(struct task_struct *tsk, int account);
+#endif /* __VMAP_STACK_H__ */
index 3e3f559..9ab2bf1 100644 (file)
@@ -3377,8 +3377,12 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
 static inline void put_task_stack(struct task_struct *tsk) {}
 #endif
 
+#ifdef CONFIG_AMLOGIC_VMAP
+#define task_stack_end_corrupted(task) (false)
+#else
 #define task_stack_end_corrupted(task) \
                (*(end_of_stack(task)) != STACK_END_MAGIC)
+#endif
 
 static inline int object_is_on_stack(void *obj)
 {
index 3d9d786..2f4fc62 100644 (file)
@@ -192,4 +192,10 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 int register_vmap_purge_notifier(struct notifier_block *nb);
 int unregister_vmap_purge_notifier(struct notifier_block *nb);
 
+#ifdef CONFIG_AMLOGIC_VMAP
+extern struct vm_struct *__get_vm_area_node(unsigned long size,
+               unsigned long align, unsigned long flags, unsigned long start,
+               unsigned long end, int node,
+               gfp_t gfp_mask, const void *caller);
+#endif
 #endif /* _LINUX_VMALLOC_H */
index 24ce22c..2322446 100644 (file)
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#ifdef CONFIG_AMLOGIC_VMAP
+#include <linux/amlogic/vmap_stack.h>
+#endif
+
 #include <trace/events/sched.h>
 
 #define CREATE_TRACE_POINTS
@@ -206,15 +210,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                tsk->stack_vm_area = find_vm_area(stack);
        return stack;
 #else
+#ifdef CONFIG_AMLOGIC_VMAP
+       return aml_stack_alloc(node, tsk);
+#else /* CONFIG_AMLOGIC_VMAP */
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
+#endif /* CONFIG_AMLOGIC_VMAP */
 #endif
 }
 
 static inline void free_thread_stack(struct task_struct *tsk)
 {
+#ifdef CONFIG_AMLOGIC_VMAP
+       aml_stack_free(tsk);
+#else /* CONFIG_AMLOGIC_VMAP */
        kaiser_unmap_thread_stack(tsk->stack);
 #ifdef CONFIG_VMAP_STACK
        if (task_stack_vm_area(tsk)) {
@@ -238,6 +249,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
 #endif
 
        __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+#endif /* CONFIG_AMLOGIC_VMAP */
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -282,6 +294,9 @@ static struct kmem_cache *mm_cachep;
 
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
+#ifdef CONFIG_AMLOGIC_VMAP
+       aml_account_task_stack(tsk, account);
+#else
        void *stack = task_stack_page(tsk);
        struct vm_struct *vm = task_stack_vm_area(tsk);
 
@@ -314,6 +329,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
                memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
                                            account * (THREAD_SIZE / 1024));
        }
+#endif /* CONFIG_AMLOGIC_VMAP*/
 }
 
 static void release_task_stack(struct task_struct *tsk)
@@ -465,12 +481,23 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
        return 0;
 }
 
+#ifdef CONFIG_AMLOGIC_VMAP
+static bool first_magic __read_mostly;
+#endif
+
 void set_task_stack_end_magic(struct task_struct *tsk)
 {
        unsigned long *stackend;
 
        stackend = end_of_stack(tsk);
+#ifdef CONFIG_AMLOGIC_VMAP
+       if (unlikely(!first_magic)) {
+               *stackend = STACK_END_MAGIC;    /* for overflow detection */
+               first_magic = 1;
+       }
+#else
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
+#endif
 }
 
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
index 195de42..1323e1a 100644 (file)
@@ -1361,9 +1361,15 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
        vm->flags &= ~VM_UNINITIALIZED;
 }
 
+#ifdef CONFIG_AMLOGIC_VMAP
+struct vm_struct *__get_vm_area_node(unsigned long size,
+               unsigned long align, unsigned long flags, unsigned long start,
+               unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+#else
 static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long flags, unsigned long start,
                unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+#endif
 {
        struct vmap_area *va;
        struct vm_struct *area;