 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

	.section __ex_table,"a"
	.section __ex_table,"a"
	.section __ex_table,"a"
	.section __ex_table,"a"
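/*
 * Note: the __ex_table fragments above appear to belong to the err1-err4
 * helper macros. Each errN; prefix used in the code below marks the user
 * memory access that follows with a local label and records that label,
 * together with a fixup target, in the exception table, so a fault in the
 * access is redirected to the matching recovery path instead of oopsing.
 */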
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
	ld	r0,STACKFRAMESIZE+16(r1)
#endif /* CONFIG_ALTIVEC */
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
	addi	r1,r1,STACKFRAMESIZE
	b	__copy_tofrom_user_base
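	/*
	 * __copy_tofrom_user_base is the generic (non-VMX) user copy routine;
	 * the error paths above fall back to it rather than trying to resume
	 * the optimised copy, presumably so the remaining bytes get its
	 * byte-accurate fault handling.
	 */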
_GLOBAL(__copy_tofrom_user_power7)
	/* Get the source 8B aligned */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	/* Now do cacheline (128B) sized loads and stores. */
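	/*
	 * Each iteration of the loop that follows moves a full 128B cacheline
	 * as sixteen doubleword loads and stores; the non-volatile registers
	 * r14-r22 saved above provide the extra scratch registers this needs
	 * (the stores at offsets 104/112/120 below are the tail of one such
	 * iteration).
	 */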
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
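	/*
	 * The leftover length (now below 128B) is worked down in halves, as
	 * the step comments below indicate: 64B, 32B and 16B chunks where the
	 * corresponding bits of r5 are set, then words, halfwords and bytes
	 * for the final few.
	 */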
	/* Up to 63B to go */
	/* Up to 31B to go */
9:	clrldi	r5,r5,(64-4)
	/* Up to 15B to go */
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
#ifdef CONFIG_ALTIVEC
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	ori	r9,r9,1		/* stream=1 */
	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
1:	lis	r0,0x0E00	/* depth=7 */
	ori	r10,r7,1	/* stream=1 */
	lis	r8,0x8000	/* GO=1 */
	/* setup read stream 0 */
	dcbt	r0,r6,0b01000	/* addr from */
	dcbt	r0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	r0,r9,0b01000	/* addr to */
	dcbtst	r0,r10,0b01010	/* length and depth to */
	dcbt	r0,r8,0b01010	/* all streams GO */
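	/*
	 * Summary of the enhanced-touch setup above: r6 and r9 carry the
	 * source and destination start addresses tagged with stream IDs 0
	 * and 1, r7 and r10 pair the length (in 128B cachelines, capped at
	 * 0x3FF) with a prefetch depth of 7, and the final dcbt with the GO
	 * bit set in r8 starts all configured streams at once.
	 */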
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
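	/*
	 * The rldicl. above keeps only the low 4 bits of r6, which is
	 * expected to hold src XOR dst here: a non-zero result means the two
	 * buffers sit at different offsets within a 16B quadword, so the
	 * slower vperm-based loop must be used.
	 */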
	/* Get the destination 16B aligned */
	/* Get the destination 128B aligned */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
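	/*
	 * One iteration of this loop writes a full 128B cacheline as eight
	 * 16B stvx stores; r10-r16 are assumed to have been preloaded with
	 * the 16B-spaced offsets used by the indexed forms above.
	 */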
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
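	/*
	 * A branch rather than a bl: the stack frame has just been popped,
	 * so .exit_vmx_usercopy can return directly to the original caller
	 * once it is done, hence the "tail call" note above.
	 */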
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	/* Get the destination 128B aligned */
	lvsl	vr16,0,r4	/* Setup permute control vector */
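	/*
	 * Standard AltiVec unaligned-load idiom: lvx ignores the low four
	 * address bits, so each aligned 16B load brings in part of two
	 * unaligned quadwords. lvsl builds a permute mask from the low bits
	 * of the source address, and each vperm below combines two adjacent
	 * aligned loads into one correctly shifted 16B result.
	 */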
	vperm	vr8,vr0,vr1,vr16

	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	vperm	vr8,vr0,vr7,vr16
	vperm	vr9,vr7,vr6,vr16
	vperm	vr10,vr6,vr5,vr16
	vperm	vr11,vr5,vr4,vr16
	vperm	vr12,vr4,vr3,vr16
	vperm	vr13,vr3,vr2,vr16
	vperm	vr14,vr2,vr1,vr16
	vperm	vr15,vr1,vr0,vr16
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
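	/*
	 * The vperm chain above is software-pipelined: the last vector
	 * loaded in one iteration (vr0) carries over as the first input of
	 * the next iteration's vperm, so each 16B of source is loaded only
	 * once despite the misalignment.
	 */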
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr1,vr16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */