Put x86 entry code into a separate link section: .entry.text.
Separating the entry text section seems to have performance
benefits - caused by more efficient instruction cache usage.
Running hackbench with perf stat --repeat showed that the change
compresses the icache footprint. The icache load miss rate went
down by about 15%:
before patch:
19417627 L1-icache-load-misses ( +- 0.147% )
after patch:
16490788 L1-icache-load-misses ( +- 0.180% )
The motivation of the patch was to fix a particular kprobes
bug that relates to the entry text section, the performance
advantage was discovered accidentally.
Whole perf output follows:
- results for current tip tree:
Performance counter stats for './hackbench/hackbench 10' (500 runs):
19417627 L1-icache-load-misses ( +- 0.147% )
2676914223 instructions # 0.497 IPC ( +- 0.079% )
5389516026 cycles ( +- 0.144% )
0.
206267711 seconds time elapsed ( +- 0.138% )
- results for current tip tree with the patch applied:
Performance counter stats for './hackbench/hackbench 10' (500 runs):
16490788 L1-icache-load-misses ( +- 0.180% )
2717734941 instructions # 0.502 IPC ( +- 0.079% )
5414756975 cycles ( +- 0.148% )
0.
206747566 seconds time elapsed ( +- 0.137% )
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: masami.hiramatsu.pt@hitachi.com
Cc: ananth@in.ibm.com
Cc: davem@davemloft.net
Cc: 2nddept-manager@sdl.hitachi.co.jp
LKML-Reference: <
20110307181039.GB15197@jolsa.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
#define sysretl_audit ia32_ret_from_sys_call
#endif
+ .section .entry.text, "ax"
+
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
.macro IA32_ARG_FIXUP noebp=0
#define sysexit_audit syscall_exit_work
#endif
+ .section .entry.text, "ax"
+
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
*/
.section .init.rodata,"a"
ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
.p2align 5
.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
.endif
.previous
.long 1b
- .text
+ .section .entry.text, "ax"
vector=vector+1
.endif
.endr
#define __AUDIT_ARCH_LE 0x40000000
.code64
+ .section .entry.text, "ax"
+
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
*/
.section .init.rodata,"a"
ENTRY(interrupt)
- .text
+ .section .entry.text
.p2align 5
.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
.endif
.previous
.quad 1b
- .text
+ .section .entry.text
vector=vector+1
.endif
.endr
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
+ ENTRY_TEXT
IRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
extern char _end[];
extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
extern char __initdata_begin[], __initdata_end[];
extern char __start_rodata[], __end_rodata[];
*(.kprobes.text) \
VMLINUX_SYMBOL(__kprobes_text_end) = .;
+#define ENTRY_TEXT \
+ ALIGN_FUNCTION(); \
+ VMLINUX_SYMBOL(__entry_text_start) = .; \
+ *(.entry.text) \
+ VMLINUX_SYMBOL(__entry_text_end) = .;
+
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#define IRQENTRY_TEXT \
ALIGN_FUNCTION(); \