In arch/powerpc/lib/copyuser_power7.S:

 */
#include <asm/ppc_asm.h>
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
+#endif
+
.macro err1
100:
.section __ex_table,"a"
[...]
li r10,32
li r11,48
- lvsl vr16,0,r4 /* Setup permute control vector */
+ LVS(vr16,0,r4) /* Setup permute control vector */
err3; lvx vr0,0,r4
addi r4,r4,16
bf cr7*4+3,5f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16
5: bf cr7*4+2,6f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
6: bf cr7*4+1,7f
err3; lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
[...]
.align 5
8:
err4; lvx vr7,r0,r4
- vperm vr8,vr0,vr7,vr16
+ VPERM(vr8,vr0,vr7,vr16)
err4; lvx vr6,r4,r9
- vperm vr9,vr7,vr6,vr16
+ VPERM(vr9,vr7,vr6,vr16)
err4; lvx vr5,r4,r10
- vperm vr10,vr6,vr5,vr16
+ VPERM(vr10,vr6,vr5,vr16)
err4; lvx vr4,r4,r11
- vperm vr11,vr5,vr4,vr16
+ VPERM(vr11,vr5,vr4,vr16)
err4; lvx vr3,r4,r12
- vperm vr12,vr4,vr3,vr16
+ VPERM(vr12,vr4,vr3,vr16)
err4; lvx vr2,r4,r14
- vperm vr13,vr3,vr2,vr16
+ VPERM(vr13,vr3,vr2,vr16)
err4; lvx vr1,r4,r15
- vperm vr14,vr2,vr1,vr16
+ VPERM(vr14,vr2,vr1,vr16)
err4; lvx vr0,r4,r16
- vperm vr15,vr1,vr0,vr16
+ VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128
err4; stvx vr8,r0,r3
err4; stvx vr9,r3,r9
[...]
bf cr7*4+1,9f
err3; lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
9: bf cr7*4+2,10f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
10: bf cr7*4+3,11f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16
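These copy loops synthesize unaligned 16-byte loads from pairs of aligned lvx loads: lvsl turns the low four bits of the source address into a permute control vector, and each vperm then selects the 16 wanted bytes out of the 32-byte concatenation of two adjacent quadwords. On little-endian, lvx presents the quadword with its bytes in the opposite element order, so the equivalent selection needs lvsr for the control vector and the two data operands of vperm swapped; that is exactly what the LVS and VPERM macros encode.

The following scalar C model is an illustrative sketch of both sequences, not kernel code: the vec type and the vperm/lvsl/lvsr/lvx/stvx helpers emulate the instruction semantics in plain C, with the alignment shift passed in directly instead of derived from an address. It checks that the big-endian and little-endian sequences extract the same source bytes for every misalignment.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only.  Register bytes are kept in big-endian
 * element order (e[0] is the most significant byte), which is the
 * numbering vperm's selector indices use in the ISA.
 */
typedef struct { uint8_t e[16]; } vec;

/* vperm: element i of the result is element (c[i] & 0x1f) of a || b */
static vec vperm(vec a, vec b, vec c)
{
	vec d;
	for (int i = 0; i < 16; i++) {
		unsigned idx = c.e[i] & 0x1f;
		d.e[i] = idx < 16 ? a.e[idx] : b.e[idx - 16];
	}
	return d;
}

/* lvsl: selectors sh, sh+1, ...; lvsr: selectors 16-sh, 17-sh, ... */
static vec lvsl(unsigned sh) { vec c; for (int i = 0; i < 16; i++) c.e[i] = sh + i; return c; }
static vec lvsr(unsigned sh) { vec c; for (int i = 0; i < 16; i++) c.e[i] = 16 - sh + i; return c; }

/* lvx/stvx move an aligned quadword; on LE, memory byte j maps to
 * element 15-j because the quadword is interpreted little-endian. */
static vec lvx(const uint8_t *p, int le)
{
	vec v;
	for (int j = 0; j < 16; j++)
		v.e[le ? 15 - j : j] = p[j];
	return v;
}

static void stvx(vec v, uint8_t *p, int le)
{
	for (int j = 0; j < 16; j++)
		p[j] = v.e[le ? 15 - j : j];
}

int main(void)
{
	uint8_t src[32], dst[16];

	for (int i = 0; i < 32; i++)
		src[i] = (uint8_t)i;

	for (unsigned sh = 0; sh < 16; sh++) {
		/* BE sequence: lvsl control vector, vperm(prev, cur, ctrl) */
		vec prev = lvx(src, 0), cur = lvx(src + 16, 0);
		stvx(vperm(prev, cur, lvsl(sh)), dst, 0);
		assert(memcmp(dst, src + sh, 16) == 0);

		/* LE sequence: lvsr control vector, data operands swapped */
		prev = lvx(src, 1);
		cur = lvx(src + 16, 1);
		stvx(vperm(cur, prev, lvsr(sh)), dst, 1);
		assert(memcmp(dst, src + sh, 16) == 0);
	}
	return 0;
}

The same transformation is applied to arch/powerpc/lib/memcpy_power7.S, which carries its own copy of the macros: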
#include <asm/ppc_asm.h>
_GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
+#endif
+
#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,4096
[...]
li r10,32
li r11,48
- lvsl vr16,0,r4 /* Setup permute control vector */
+ LVS(vr16,0,r4) /* Setup permute control vector */
lvx vr0,0,r4
addi r4,r4,16
bf cr7*4+3,5f
lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
stvx vr8,r0,r3
addi r3,r3,16
5: bf cr7*4+2,6f
lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
stvx vr8,r0,r3
stvx vr9,r3,r9
6: bf cr7*4+1,7f
lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
stvx vr8,r0,r3
stvx vr9,r3,r9
[...]
.align 5
8:
lvx vr7,r0,r4
- vperm vr8,vr0,vr7,vr16
+ VPERM(vr8,vr0,vr7,vr16)
lvx vr6,r4,r9
- vperm vr9,vr7,vr6,vr16
+ VPERM(vr9,vr7,vr6,vr16)
lvx vr5,r4,r10
- vperm vr10,vr6,vr5,vr16
+ VPERM(vr10,vr6,vr5,vr16)
lvx vr4,r4,r11
- vperm vr11,vr5,vr4,vr16
+ VPERM(vr11,vr5,vr4,vr16)
lvx vr3,r4,r12
- vperm vr12,vr4,vr3,vr16
+ VPERM(vr12,vr4,vr3,vr16)
lvx vr2,r4,r14
- vperm vr13,vr3,vr2,vr16
+ VPERM(vr13,vr3,vr2,vr16)
lvx vr1,r4,r15
- vperm vr14,vr2,vr1,vr16
+ VPERM(vr14,vr2,vr1,vr16)
lvx vr0,r4,r16
- vperm vr15,vr1,vr0,vr16
+ VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128
stvx vr8,r0,r3
stvx vr9,r3,r9
[...]
bf cr7*4+1,9f
lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
stvx vr8,r0,r3
stvx vr9,r3,r9
9: bf cr7*4+2,10f
lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
stvx vr8,r0,r3
stvx vr9,r3,r9
10: bf cr7*4+3,11f
lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
stvx vr8,r0,r3
addi r3,r3,16
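Each file defines LVS and VPERM for itself because the .S files are preprocessed independently; a single #ifdef block per file swaps both the shift direction (lvsl versus lvsr) and the vperm operand order without touching any of the call sites, so LVS(vr16,0,r4) assembles to lvsl vr16,0,r4 on big-endian builds and lvsr vr16,0,r4 on little-endian ones. __BIG_ENDIAN__ is the endianness macro the compiler predefines for PowerPC targets, so the selection needs no Kconfig support.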