[PATCH] Prefetch kernel stacks to speed up context switch
authorChen, Kenneth W <kenneth.w.chen@intel.com>
Fri, 9 Sep 2005 20:02:02 +0000 (13:02 -0700)
committerLinus Torvalds <torvalds@g5.osdl.org>
Fri, 9 Sep 2005 20:57:31 +0000 (13:57 -0700)
For architecture like ia64, the switch stack structure is fairly large
(currently 528 bytes).  For context switch intensive application, we found
that significant amount of cache misses occurs in switch_to() function.
The following patch adds a hook in the schedule() function to prefetch
switch stack structure as soon as 'next' task is determined.  This allows
maximum overlap in prefetch cache lines for that structure.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/ia64/kernel/entry.S
include/asm-ia64/system.h
include/linux/sched.h
kernel/sched.c

index 3c882102450999450b783d4b8642d4c74f4bae03..915e12791836562a3441d6c4d51bbb6664a7ee37 100644 (file)
@@ -470,6 +470,29 @@ ENTRY(load_switch_stack)
        br.cond.sptk.many b7
 END(load_switch_stack)
 
+GLOBAL_ENTRY(prefetch_stack)
+       add r14 = -IA64_SWITCH_STACK_SIZE, sp
+       add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0
+       ;;
+       ld8 r16 = [r15]                         // load next's stack pointer
+       lfetch.fault.excl [r14], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault [r16], 128
+       br.ret.sptk.many rp
+END(prefetch_switch_stack)
+
 GLOBAL_ENTRY(execve)
        mov r15=__NR_execve                     // put syscall number in place
        break __BREAK_SYSCALL
index 33256db4a7cf1b0ad4f82cdcf935b0bd421a42ed..635235fa1e32606c5955653bed40d0a77cb4899c 100644 (file)
@@ -275,6 +275,7 @@ extern void ia64_load_extra (struct task_struct *task);
  */
 #define __ARCH_WANT_UNLOCKED_CTXSW
 
+#define ARCH_HAS_PREFETCH_SWITCH_STACK
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
 void cpu_idle_wait(void);
index ea1b5f32ec5c1952cbf254f4563d7d707a85e155..c551e6a1447e57949e84435fe3fb846799d30f8b 100644 (file)
@@ -604,6 +604,11 @@ extern int groups_search(struct group_info *group_info, gid_t grp);
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
+#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
+extern void prefetch_stack(struct task_struct*);
+#else
+static inline void prefetch_stack(struct task_struct *t) { }
+#endif
 
 struct audit_context;          /* See audit.c */
 struct mempolicy;
index 18b95520a2e29bdfa181aecb913f0ea50995f609..2632b812cf24a1b7ce0e109689793b8476362d42 100644 (file)
@@ -2888,6 +2888,7 @@ switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
+       prefetch_stack(next);
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));