From cef00787eb669aa6276a8d684ed6effc7a24cf44 Mon Sep 17 00:00:00 2001
From: Ekaterina Gorelkina
Date: Thu, 15 Jul 2010 17:33:55 +0400
Subject: [PATCH] Fix kernel dump during app instrumentation for Tegra 250 (multi-core ARM)

---
 kprobe/kprobes.c      | 286 +++++++++++++++++++++++++++++++++++++++++++-------
 kprobe/kprobes.h      |   4 +-
 kprobe/kprobes_arch.c |  56 +++++++++-
 3 files changed, 307 insertions(+), 39 deletions(-)

diff --git a/kprobe/kprobes.c b/kprobe/kprobes.c
index 7e5a524..3848a7e 100644
--- a/kprobe/kprobes.c
+++ b/kprobe/kprobes.c
@@ -820,6 +820,7 @@ valid_p:
 int __kprobes
 register_ujprobe (struct task_struct *task, struct mm_struct *mm, struct jprobe *jp, int atomic)
 {
+	int ret = 0;
 #ifdef _DEBUG
 	gSilent = 0;
 #endif
@@ -827,7 +828,7 @@ register_ujprobe (struct task_struct *task, struct mm_struct *mm, struct jprobe
 	jp->kp.pre_handler = setjmp_pre_handler;
 	jp->kp.break_handler = longjmp_break_handler;
 
-	int ret = __register_uprobe (&jp->kp, task, atomic,
+	ret = __register_uprobe (&jp->kp, task, atomic,
 				     (unsigned long) __builtin_return_address (0));
 
 #ifdef _DEBUG
@@ -1302,57 +1303,270 @@ unregister_all_uprobes (struct task_struct *task, int atomic)
 	purge_garbage_uslots(task, atomic);
 }
-#if 0
+
+#define GUP_FLAGS_WRITE 0x1
+#define GUP_FLAGS_FORCE 0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8
+
+
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+	/*
+	 * We don't want to optimize FOLL_ANON for make_pages_present()
+	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+	 * we want to get the page from the page tables to make sure
+	 * that we serialize and update with any other user of that
+	 * mapping.
+	 */
+	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+		return 0;
+	/*
+	 * And if we have a fault routine, it's not an anonymous region.
+	 */
+	return !vma->vm_ops || !vma->vm_ops->fault;
+}
+
+int __get_user_pages_uprobe(struct task_struct *tsk, struct mm_struct *mm,
+			unsigned long start, int len, int flags,
+			struct page **pages, struct vm_area_struct **vmas)
+{
+	int i;
+	unsigned int vm_flags = 0;
+	int write = !!(flags & GUP_FLAGS_WRITE);
+	int force = !!(flags & GUP_FLAGS_FORCE);
+	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+
+	if (len <= 0)
+		return 0;
+	/*
+	 * Require read or write permissions.
+	 * If 'force' is set, we only require the "MAY" flags.
+	 */
+	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	i = 0;
+
+	do {
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
+
+		//vma = find_extend_vma(mm, start);
+		vma = find_vma(mm, start);
+		if (!vma && in_gate_area(tsk, start)) {
+			unsigned long pg = start & PAGE_MASK;
+			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+			pte_t *pte;
+
+			/* user gate pages are read-only */
+			if (!ignore && write)
+				return i ? : -EFAULT;
+			if (pg > TASK_SIZE)
+				pgd = pgd_offset_k(pg);
+			else
+				pgd = pgd_offset_gate(mm, pg);
+			BUG_ON(pgd_none(*pgd));
+			pud = pud_offset(pgd, pg);
+			BUG_ON(pud_none(*pud));
+			pmd = pmd_offset(pud, pg);
+			if (pmd_none(*pmd))
+				return i ? : -EFAULT;
+			pte = pte_offset_map(pmd, pg);
+			if (pte_none(*pte)) {
+				pte_unmap(pte);
+				return i ? : -EFAULT;
+			}
+			if (pages) {
+				struct page *page = vm_normal_page(gate_vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
+			}
+			pte_unmap(pte);
+			if (vmas)
+				vmas[i] = gate_vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+			continue;
+		}
+
+		if (!vma ||
+		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+		    (!ignore && !(vm_flags & vma->vm_flags)))
+			return i ? : -EFAULT;
+
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i, write);
+			continue;
+		}
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && use_zero_page(vma))
+			foll_flags |= FOLL_ANON;
+
+		do {
+			struct page *page;
+
+			/*
+			 * If we have a pending SIGKILL, don't keep faulting
+			 * pages and potentially allocating memory, unless
+			 * current is handling munlock--e.g., on exit. In
+			 * that case, we are not allocating memory. Rather,
+			 * we're only unlocking already resident/mapped pages.
+			 */
+			if (unlikely(!ignore_sigkill &&
+					fatal_signal_pending(current)))
+				return i ? i : -ERESTARTSYS;
+
+			if (write)
+				foll_flags |= FOLL_WRITE;
+
+
+			//cond_resched();
+
+			DBPRINTF ("pages = %p vma = %p\n", pages, vma);
+			while (!(page = follow_page(vma, start, foll_flags))) {
+				int ret;
+				ret = handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
+				if (ret & VM_FAULT_ERROR) {
+					if (ret & VM_FAULT_OOM)
+						return i ? i : -ENOMEM;
+					else if (ret & VM_FAULT_SIGBUS)
+						return i ? i : -EFAULT;
+					BUG();
+				}
+				if (ret & VM_FAULT_MAJOR)
+					tsk->maj_flt++;
+				else
+					tsk->min_flt++;
+
+				/*
+				 * The VM_FAULT_WRITE bit tells us that
+				 * do_wp_page has broken COW when necessary,
+				 * even if maybe_mkwrite decided not to set
+				 * pte_write. We can thus safely do subsequent
+				 * page lookups as if they were reads. But only
+				 * do so when looping for pte_write is futile:
+				 * in some cases userspace may also be wanting
+				 * to write to the gotten user page, which a
+				 * read fault here might prevent (a readonly
+				 * page might get reCOWed by userspace write).
+				 */
+				if ((ret & VM_FAULT_WRITE) &&
+				    !(vma->vm_flags & VM_WRITE))
+					foll_flags &= ~FOLL_WRITE;
+
+				//cond_resched();
+			}
+			if (IS_ERR(page))
+				return i ? i : PTR_ERR(page);
+			if (pages) {
+				pages[i] = page;
+
+				flush_anon_page(vma, page, start);
+				flush_dcache_page(page);
+			}
+			if (vmas)
+				vmas[i] = vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+		} while (len && start < vma->vm_end);
+	} while (len);
+	return i;
+}
+
+int get_user_pages_uprobe(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int flags = 0;
+
+	if (write)
+		flags |= GUP_FLAGS_WRITE;
+	if (force)
+		flags |= GUP_FLAGS_FORCE;
+
+	return __get_user_pages_uprobe(tsk, mm,
+				start, len, flags,
+				pages, vmas);
+}
+
 int
-access_process_vm (struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+access_process_vm_atomic (struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
 {
-	struct mm_struct *mm;
+
+
+	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	struct page *page;
 	void *old_buf = buf;
 
-	mm = get_task_mm (tsk);
+	mm = get_task_mm(tsk);
 	if (!mm)
 		return 0;
 
-	down_read (&mm->mmap_sem);
-	/* ignore errors, just check how much was sucessfully transfered */
-	while (len)
-	{
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was successfully transferred */
+	while (len) {
 		int bytes, ret, offset;
 		void *maddr;
-
-		ret = get_user_pages (tsk, mm, addr, 1, write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE - 1);
-		if (bytes > PAGE_SIZE - offset)
-			bytes = PAGE_SIZE - offset;
-
-		maddr = kmap (page); //, KM_USER0);
-		if (write)
-		{
-			copy_to_user_page (vma, page, addr, maddr + offset, buf, bytes);
-			set_page_dirty_lock (page);
-		}
-		else
-		{
-			copy_from_user_page (vma, page, addr, buf, maddr + offset, bytes);
+		struct page *page = NULL;
+
+		ret = get_user_pages_uprobe(tsk, mm, addr, 1,
+				write, 1, &page, &vma);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			if (!vma)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
+		} else {
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
 		}
-		kunmap (page); //, KM_USER0);
-		page_cache_release (page);
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;
 	}
-	up_read (&mm->mmap_sem);
-	mmput (mm);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
 	return buf - old_buf;
+
 }
-#endif
 
 #ifdef CONFIG_DEBUG_FS
 const char *(*__real_kallsyms_lookup) (unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf);
@@ -1732,7 +1946,7 @@ EXPORT_SYMBOL_GPL (unregister_kretprobe);
 EXPORT_SYMBOL_GPL (register_uretprobe);
 EXPORT_SYMBOL_GPL (unregister_uretprobe);
 EXPORT_SYMBOL_GPL (unregister_all_uprobes);
-//EXPORT_SYMBOL_GPL (access_process_vm_atomic);
+EXPORT_SYMBOL_GPL (access_process_vm_atomic);
 #if LINUX_VERSION_CODE != KERNEL_VERSION(2,6,23)
 EXPORT_SYMBOL_GPL (access_process_vm);
 #endif
@@ -1741,4 +1955,4 @@ EXPORT_SYMBOL_GPL (is_page_present);
 #else
 EXPORT_SYMBOL_GPL (page_present);
 #endif
-//EXPORT_SYMBOL_GPL(get_user_pages_atomic);
+
diff --git a/kprobe/kprobes.h b/kprobe/kprobes.h
index 8ca0ca1..82b0235 100644
--- a/kprobe/kprobes.h
+++ b/kprobe/kprobes.h
@@ -246,8 +246,8 @@ void set_normalized_timeval (struct timeval *tv, time_t sec, suseconds_t usec);
 kprobe_opcode_t *get_insn_slot (struct task_struct *task, int atomic);
 void free_insn_slot (struct hlist_head *page_list, struct task_struct *task, kprobe_opcode_t *slot, int dirty);
 
-//int access_process_vm_atomic(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-#define access_process_vm_atomic access_process_vm
+int access_process_vm_atomic(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
+
 #define read_proc_vm_atomic(tsk, addr, buf, len) access_process_vm_atomic(tsk, addr, buf, len, 0)
 #define write_proc_vm_atomic(tsk, addr, buf, len) access_process_vm_atomic(tsk, addr, buf, len, 1)
 int page_present (struct mm_struct *mm, unsigned long addr);
diff --git a/kprobe/kprobes_arch.c b/kprobe/kprobes_arch.c
index 46c2e19..e3bfc03 100644
--- a/kprobe/kprobes_arch.c
+++ b/kprobe/kprobes_arch.c
@@ -1572,6 +1572,12 @@ collect_one_slot (struct hlist_head *page_list, struct task_struct *task,
 	else
 	{
 		if(task){
+//E. G.: This code caused a kernel dump because of rescheduling while atomic.
+//As a workaround it is disabled here. The instrumented process will leak
+//memory, but the instrumentation itself keeps working correctly.
+//A proper fix is planned as part of the KProbe redesign aimed at improving
+//supportability and performance.
+#if 0
 			//printk("collect_one_slot %p/%d\n", task, task->pid);
 			mm = get_task_mm (task);
 			if (mm){
@@ -1580,6 +1586,8 @@ collect_one_slot (struct hlist_head *page_list, struct task_struct *task,
 				up_write (&mm->mmap_sem);
 				mmput(mm);
 			}
+#endif
+			kip->insns = NULL; //workaround
 			kip->tgid = 0;
 		}
 		else {
@@ -2314,7 +2322,7 @@ setjmp_pre_handler (struct kprobe *p, struct pt_regs *regs)
 		rcu_read_unlock();
 	}
 	if (pre_entry)
-		p->ss_addr = pre_entry (jp->priv_arg, regs);
+		p->ss_addr = (void *)pre_entry (jp->priv_arg, regs);
 	if (entry){
 # if defined(CONFIG_MIPS)
 		entry (regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7], regs->regs[8], regs->regs[9]);
@@ -3080,6 +3088,17 @@ static struct jprobe do_exit_p =
 // persistent deps
 DECLARE_MOD_CB_DEP(kallsyms_search, unsigned long, const char *name);
 DECLARE_MOD_FUNC_DEP(access_process_vm, int, struct task_struct * tsk, unsigned long addr, void *buf, int len, int write);
+
+DECLARE_MOD_FUNC_DEP(find_extend_vma, struct vm_area_struct *, struct mm_struct * mm, unsigned long addr);
+DECLARE_MOD_FUNC_DEP(handle_mm_fault, int, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access);
+DECLARE_MOD_FUNC_DEP(get_gate_vma, struct vm_area_struct *, struct task_struct *tsk);
+DECLARE_MOD_FUNC_DEP(in_gate_area_no_task, int, unsigned long addr);
+DECLARE_MOD_FUNC_DEP(follow_page, struct page *, struct vm_area_struct * vma, unsigned long address, unsigned int foll_flags);
+DECLARE_MOD_FUNC_DEP(__flush_anon_page, void, struct vm_area_struct *vma, struct page *page, unsigned long vmaddr);
+DECLARE_MOD_FUNC_DEP(vm_normal_page, struct page *, struct vm_area_struct *vma, unsigned long addr, pte_t pte);
+DECLARE_MOD_FUNC_DEP(flush_ptrace_access, void, struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len, int write);
+
+
 // deps controled by config macros
 #ifdef KERNEL_HAS_ISPAGEPRESENT
 DECLARE_MOD_FUNC_DEP(is_page_present, int, struct mm_struct * mm, unsigned long address);
@@ -3128,6 +3147,32 @@ DECLARE_MOD_FUNC_DEP(put_task_struct, void, struct rcu_head * rhp);
 // persistent deps
 DECLARE_MOD_DEP_WRAPPER(access_process_vm, int, struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
 IMP_MOD_DEP_WRAPPER(access_process_vm, tsk, addr, buf, len, write)
+
+DECLARE_MOD_DEP_WRAPPER (find_extend_vma, struct vm_area_struct *, struct mm_struct * mm, unsigned long addr)
+IMP_MOD_DEP_WRAPPER (find_extend_vma, mm, addr)
+
+DECLARE_MOD_DEP_WRAPPER (handle_mm_fault, int, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access)
+IMP_MOD_DEP_WRAPPER (handle_mm_fault, mm, vma, address, write_access)
+
+DECLARE_MOD_DEP_WRAPPER (get_gate_vma, struct vm_area_struct *, struct task_struct *tsk)
+IMP_MOD_DEP_WRAPPER (get_gate_vma, tsk)
+
+DECLARE_MOD_DEP_WRAPPER (in_gate_area_no_task, int, unsigned long addr)
+IMP_MOD_DEP_WRAPPER (in_gate_area_no_task, addr)
+
+DECLARE_MOD_DEP_WRAPPER (follow_page, struct page *, struct vm_area_struct * vma, unsigned long address, unsigned int foll_flags)
+IMP_MOD_DEP_WRAPPER (follow_page, vma, address, foll_flags)
+
+DECLARE_MOD_DEP_WRAPPER (__flush_anon_page, void, struct vm_area_struct *vma, struct page *page, unsigned long vmaddr)
+IMP_MOD_DEP_WRAPPER (__flush_anon_page, vma, page, vmaddr)
+
+DECLARE_MOD_DEP_WRAPPER(vm_normal_page, struct page *, struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+IMP_MOD_DEP_WRAPPER (vm_normal_page, vma, addr, pte)
+
+DECLARE_MOD_DEP_WRAPPER (flush_ptrace_access, void, struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len, int write)
+IMP_MOD_DEP_WRAPPER (flush_ptrace_access, vma, page, uaddr, kaddr, len, write)
+
+
 // deps controled by config macros
 #ifdef KERNEL_HAS_ISPAGEPRESENT
 int is_page_present (struct mm_struct *mm, unsigned long address)
@@ -3219,6 +3264,15 @@ int __init arch_init_kprobes (void)
 	sched_addr = (kprobe_opcode_t *)kallsyms_search("__switch_to");//"schedule");
 	fork_addr = (kprobe_opcode_t *)kallsyms_search("do_fork");
+
+	INIT_MOD_DEP_VAR(handle_mm_fault, handle_mm_fault);
+	INIT_MOD_DEP_VAR(flush_ptrace_access, flush_ptrace_access);
+	INIT_MOD_DEP_VAR(find_extend_vma, find_extend_vma);
+	INIT_MOD_DEP_VAR(get_gate_vma, get_gate_vma);
+	INIT_MOD_DEP_VAR(in_gate_area_no_task, in_gate_area_no_task);
+	INIT_MOD_DEP_VAR(follow_page, follow_page);
+	INIT_MOD_DEP_VAR(__flush_anon_page, __flush_anon_page);
+	INIT_MOD_DEP_VAR(vm_normal_page, vm_normal_page);
 	INIT_MOD_DEP_VAR(access_process_vm, access_process_vm);
 
 #ifdef KERNEL_HAS_ISPAGEPRESENT
-- 
2.7.4