powerpc: Improve 64bit copy_tofrom_user
author Anton Blanchard <anton@samba.org>
Wed, 10 Feb 2010 14:56:26 +0000 (14:56 +0000)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 17 Feb 2010 03:03:16 +0000 (14:03 +1100)
Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user.
The loop now does 32 bytes at a time, as well as pairing loads and stores.
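
Conceptually, the new inner loop is shaped roughly like the C sketch below
(illustrative only; the function name is made up, and the real code is
hand-written assembly that additionally software-pipelines the loads of one
16-byte pair against the stores of the previous pair, with exception-table
fixups for faulting user accesses):

#include <stdint.h>
#include <stddef.h>

/*
 * Rough approximation of the new main-loop shape: 32 bytes per
 * iteration, with loads grouped ahead of the paired stores.
 * Purely illustrative -- not the kernel implementation.
 */
static void copy_32b_chunks(uint64_t *dst, const uint64_t *src,
			    size_t chunks)
{
	uint64_t a, b, c, d;

	while (chunks--) {
		a = src[0];	/* paired loads ...       */
		b = src[1];
		c = src[2];
		d = src[3];
		dst[0] = a;	/* ... then paired stores */
		dst[1] = b;
		dst[2] = c;
		dst[3] = d;
		src += 4;
		dst += 4;
	}
}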

A quick test case that reads 8kB over and over shows the improvement:

POWER6: 53% faster
POWER7: 51% faster

#define _XOPEN_SOURCE 500
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000

int main(void)
{
	char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
	int fd;
	char buf[BUFSIZE];
	unsigned long i;

	fd = mkstemp(tmpfile);
	if (fd < 0) {
		perror("mkstemp");
		exit(1);
	}

	if (write(fd, buf, BUFSIZE) != BUFSIZE) {
		perror("write");
		exit(1);
	}

	for (i = 0; i < ITERATIONS; i++) {
		if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
			perror("pread");
			exit(1);
		}
	}

	unlink(tmpfile);

	return 0;
}
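
For reference, the test builds with a plain compiler invocation and can be
timed with time(1); the file name and flags below are just an example, not
part of the patch:

gcc -O2 -o copy_to_user_test copy_to_user_test.c
time ./copy_to_user_test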

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/lib/copyuser_64.S

index 693b14a..578b625 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
        andi.   r0,r4,7
        bne     .Lsrc_unaligned
 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
-       srdi    r7,r5,4
-20:    ld      r9,0(r4)
-       addi    r4,r4,-8
-       mtctr   r7
-       andi.   r5,r5,7
-       bf      cr7*4+0,22f
-       addi    r3,r3,8
-       addi    r4,r4,8
-       mr      r8,r9
-       blt     cr1,72f
-21:    ld      r9,8(r4)
-70:    std     r8,8(r3)
-22:    ldu     r8,16(r4)
-71:    stdu    r9,16(r3)
+       blt     cr1,.Ldo_tail           /* if < 16 bytes to copy */
+       srdi    r0,r5,5
+       cmpdi   cr1,r0,0
+20:    ld      r7,0(r4)
+220:   ld      r6,8(r4)
+       addi    r4,r4,16
+       mtctr   r0
+       andi.   r0,r5,0x10
+       beq     22f
+       addi    r3,r3,16
+       addi    r4,r4,-16
+       mr      r9,r7
+       mr      r8,r6
+       beq     cr1,72f
+21:    ld      r7,16(r4)
+221:   ld      r6,24(r4)
+       addi    r4,r4,32
+70:    std     r9,0(r3)
+270:   std     r8,8(r3)
+22:    ld      r9,0(r4)
+222:   ld      r8,8(r4)
+71:    std     r7,16(r3)
+271:   std     r6,24(r3)
+       addi    r3,r3,32
        bdnz    21b
-72:    std     r8,8(r3)
+72:    std     r9,0(r3)
+272:   std     r8,8(r3)
+       andi.   r5,r5,0xf
        beq+    3f
-       addi    r3,r3,16
+       addi    r4,r4,16
 .Ldo_tail:
-       bf      cr7*4+1,1f
-23:    lwz     r9,8(r4)
+       addi    r3,r3,16
+       bf      cr7*4+0,246f
+244:   ld      r9,0(r4)
+       addi    r4,r4,8
+245:   std     r9,0(r3)
+       addi    r3,r3,8
+246:   bf      cr7*4+1,1f
+23:    lwz     r9,0(r4)
        addi    r4,r4,4
 73:    stw     r9,0(r3)
        addi    r3,r3,4
 1:     bf      cr7*4+2,2f
-44:    lhz     r9,8(r4)
+44:    lhz     r9,0(r4)
        addi    r4,r4,2
 74:    sth     r9,0(r3)
        addi    r3,r3,2
 2:     bf      cr7*4+3,3f
-45:    lbz     r9,8(r4)
+45:    lbz     r9,0(r4)
 75:    stb     r9,0(r3)
 3:     li      r3,0
        blr
@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 131:
        addi    r3,r3,8
 120:
+320:
 122:
+322:
 124:
 125:
 126:
@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 129:
 133:
        addi    r3,r3,8
-121:
 132:
        addi    r3,r3,8
+121:
+321:
+344:
 134:
 135:
 138:
@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 183:
        add     r3,r3,r7
        b       1f
+371:
 180:
        addi    r3,r3,8
 171:
 177:
        addi    r3,r3,8
-170:
-172:
+370:
+372:
 176:
 178:
        addi    r3,r3,4
 185:
        addi    r3,r3,4
+170:
+172:
+345:
 173:
 174:
 175:
@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        .section __ex_table,"a"
        .align  3
        .llong  20b,120b
+       .llong  220b,320b
        .llong  21b,121b
+       .llong  221b,321b
        .llong  70b,170b
+       .llong  270b,370b
        .llong  22b,122b
+       .llong  222b,322b
        .llong  71b,171b
+       .llong  271b,371b
        .llong  72b,172b
+       .llong  272b,372b
+       .llong  244b,344b
+       .llong  245b,345b
        .llong  23b,123b
        .llong  73b,173b
        .llong  44b,144b