Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user.
The loop now does 32 bytes at a time, as well as pairing loads and stores.
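As a rough sketch of what the new loop does, here is a plain C model
(not the kernel asm; copy_pipelined and everything else in it is made
up for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy model of the new inner loop: each iteration moves 32 bytes, and
 * the loads for one 16-byte pair are issued before the stores of the
 * previous pair, so loads and stores go out in pairs and each store
 * consumes data that was loaded earlier. Copies 32*n32 + 16 bytes;
 * the real asm also handles alignment and the sub-16-byte tail.
 */
static void copy_pipelined(uint64_t *dst, const uint64_t *src, size_t n32)
{
        uint64_t a = *src++;            /* prime: first 16 bytes into "registers" */
        uint64_t b = *src++;
        size_t i;

        for (i = 0; i < n32; i++) {
                uint64_t c = *src++;    /* paired loads ... */
                uint64_t d = *src++;
                *dst++ = a;             /* ... paired stores of the older data */
                *dst++ = b;
                a = *src++;
                b = *src++;
                *dst++ = c;
                *dst++ = d;
        }
        *dst++ = a;                     /* drain the final 16 bytes */
        *dst++ = b;
}

int main(void)
{
        uint64_t src[18], dst[18];
        size_t i;

        for (i = 0; i < 18; i++)
                src[i] = i;
        copy_pipelined(dst, src, 4);    /* 4*32 + 16 = 144 bytes */
        printf("%s\n", memcmp(dst, src, sizeof(dst)) ? "mismatch" : "ok");
        return 0;
}

The same rotation is visible in the asm below: the std pair at 70:/270:
stores registers that were loaded a half-iteration earlier at 22:/222:.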
A quick test case that reads 8kB over and over shows the improvement:
POWER6: 53% faster
POWER7: 51% faster
#define _XOPEN_SOURCE 500
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000
int main(void)
{
        char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
        int fd;
        char buf[BUFSIZE];
        unsigned long i;

        fd = mkstemp(tmpfile);
        if (fd < 0) {
                perror("mkstemp");
                exit(1);
        }

        /* Fill the file so every pread below returns a full 8kB. */
        if (write(fd, buf, BUFSIZE) != BUFSIZE) {
                perror("write");
                exit(1);
        }

        /* Each pread copies BUFSIZE bytes to userspace via copy_to_user(). */
        for (i = 0; i < ITERATIONS; i++) {
                if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
                        perror("pread");
                        exit(1);
                }
        }

        unlink(tmpfile);
        return 0;
}
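One way to build and time it, saving the above as copy_to_user_test.c
(filename and flags are just a suggestion):

gcc -O2 -o copy_to_user_test copy_to_user_test.c
time ./copy_to_user_test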
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
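The change is to the __copy_tofrom_user loop in arch/powerpc/lib/copyuser_64.S: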
andi. r0,r4,7
bne .Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
- srdi r7,r5,4
-20: ld r9,0(r4)
- addi r4,r4,-8
- mtctr r7
- andi. r5,r5,7
- bf cr7*4+0,22f
- addi r3,r3,8
- addi r4,r4,8
- mr r8,r9
- blt cr1,72f
-21: ld r9,8(r4)
-70: std r8,8(r3)
-22: ldu r8,16(r4)
-71: stdu r9,16(r3)
+ blt cr1,.Ldo_tail /* if < 16 bytes to copy */
+ srdi r0,r5,5
+ cmpdi cr1,r0,0
+20: ld r7,0(r4)
+220: ld r6,8(r4)
+ addi r4,r4,16
+ mtctr r0
+ andi. r0,r5,0x10
+ beq 22f
+ addi r3,r3,16
+ addi r4,r4,-16
+ mr r9,r7
+ mr r8,r6
+ beq cr1,72f
+21: ld r7,16(r4)
+221: ld r6,24(r4)
+ addi r4,r4,32
+70: std r9,0(r3)
+270: std r8,8(r3)
+22: ld r9,0(r4)
+222: ld r8,8(r4)
+71: std r7,16(r3)
+271: std r6,24(r3)
+ addi r3,r3,32
bdnz 21b
-72: std r8,8(r3)
+72: std r9,0(r3)
+272: std r8,8(r3)
+ andi. r5,r5,0xf
beq+ 3f
- addi r3,r3,16
+ addi r4,r4,16
.Ldo_tail:
- bf cr7*4+1,1f
-23: lwz r9,8(r4)
+ addi r3,r3,16
+ bf cr7*4+0,246f
+244: ld r9,0(r4)
+ addi r4,r4,8
+245: std r9,0(r3)
+ addi r3,r3,8
+246: bf cr7*4+1,1f
+23: lwz r9,0(r4)
addi r4,r4,4
73: stw r9,0(r3)
addi r3,r3,4
1: bf cr7*4+2,2f
-44: lhz r9,8(r4)
+44: lhz r9,0(r4)
addi r4,r4,2
74: sth r9,0(r3)
addi r3,r3,2
2: bf cr7*4+3,3f
-45: lbz r9,8(r4)
+45: lbz r9,0(r4)
75: stb r9,0(r3)
3: li r3,0
blr
131:
addi r3,r3,8
120:
+320:
122:
+322:
124:
125:
126:
129:
133:
addi r3,r3,8
-121:
132:
addi r3,r3,8
+121:
+321:
+344:
134:
135:
138:
183:
add r3,r3,r7
b 1f
+371:
180:
addi r3,r3,8
171:
177:
addi r3,r3,8
-170:
-172:
+370:
+372:
176:
178:
addi r3,r3,4
185:
addi r3,r3,4
+170:
+172:
+345:
173:
174:
175:
.section __ex_table,"a"
.align 3
.llong 20b,120b
+ .llong 220b,320b
.llong 21b,121b
+ .llong 221b,321b
.llong 70b,170b
+ .llong 270b,370b
.llong 22b,122b
+ .llong 222b,322b
.llong 71b,171b
+ .llong 271b,371b
.llong 72b,172b
+ .llong 272b,372b
+ .llong 244b,344b
+ .llong 245b,345b
.llong 23b,123b
.llong 73b,173b
.llong 44b,144b