Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial
[platform/adaptation/renesas_rcar/renesas_kernel.git] / arch / powerpc / lib / copyuser_power7.S
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software
14  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  *
16  * Copyright (C) IBM Corporation, 2011
17  *
18  * Author: Anton Blanchard <anton@au.ibm.com>
19  */
20 #include <asm/ppc_asm.h>
21
/*
 * err1: written in front of an instruction that may fault.  Marks the
 * current location with local label 100 and adds an __ex_table entry
 * pairing that address with the fixup code at .Ldo_err1.
 */
22         .macro err1
23 100:
24         .section __ex_table,"a"
25         .align 3
26         .llong 100b,.Ldo_err1
27         .previous
28         .endm
29
/*
 * err2: same as err1 but the __ex_table entry (local label 200) points
 * at .Ldo_err2, which reloads r14-r22 and pops the stack frame before
 * continuing into the err1 fixup.
 */
30         .macro err2
31 200:
32         .section __ex_table,"a"
33         .align 3
34         .llong 200b,.Ldo_err2
35         .previous
36         .endm
37
38 #ifdef CONFIG_ALTIVEC
/*
 * err3: __ex_table entry (local label 300) pointing at .Ldo_err3, the
 * fixup used by the VMX paths while r14-r16 do not need reloading.
 */
39         .macro err3
40 300:
41         .section __ex_table,"a"
42         .align 3
43         .llong 300b,.Ldo_err3
44         .previous
45         .endm
46
/*
 * err4: __ex_table entry (local label 400) pointing at .Ldo_err4, the
 * fixup used inside the VMX cacheline loops, where r14-r16 have been
 * overwritten and must be reloaded from the stack frame.
 */
47         .macro err4
48 400:
49         .section __ex_table,"a"
50         .align 3
51         .llong 400b,.Ldo_err4
52         .previous
53         .endm
54
55
/*
 * Fixup code for faults in the VMX copy paths.
 *   .Ldo_err4 - reload r14-r16 from the stack frame, then fall through.
 *   .Ldo_err3 - call exit_vmx_usercopy, reload LR from the slot it was
 *               saved to before the frame was opened (16 bytes above the
 *               old stack pointer), and continue at .Lexit, which pops
 *               the frame and retries through __copy_tofrom_user_base.
 */
56 .Ldo_err4:
57         ld      r16,STK_REG(R16)(r1)
58         ld      r15,STK_REG(R15)(r1)
59         ld      r14,STK_REG(R14)(r1)
60 .Ldo_err3:
61         bl      .exit_vmx_usercopy
62         ld      r0,STACKFRAMESIZE+16(r1)        /* LR saved by .Lvmx_copy */
63         mtlr    r0
64         b       .Lexit
65 #endif /* CONFIG_ALTIVEC */
66
/*
 * Fixup chain for faults.  Each label is one stage of unwinding and
 * falls through to the next:
 *   .Ldo_err2 - reload r14-r22 from the stack frame
 *   .Lexit    - pop the stack frame (also the join point for .Ldo_err3)
 *   .Ldo_err1 - reload the r3/r4/r5 values stored at function entry and
 *               branch to __copy_tofrom_user_base with those original
 *               arguments
 */
67 .Ldo_err2:
68         ld      r22,STK_REG(R22)(r1)
69         ld      r21,STK_REG(R21)(r1)
70         ld      r20,STK_REG(R20)(r1)
71         ld      r19,STK_REG(R19)(r1)
72         ld      r18,STK_REG(R18)(r1)
73         ld      r17,STK_REG(R17)(r1)
74         ld      r16,STK_REG(R16)(r1)
75         ld      r15,STK_REG(R15)(r1)
76         ld      r14,STK_REG(R14)(r1)
77 .Lexit:
78         addi    r1,r1,STACKFRAMESIZE
79 .Ldo_err1:
80         ld      r3,48(r1)
81         ld      r4,56(r1)
82         ld      r5,64(r1)
83         b       __copy_tofrom_user_base
84
85
/*
 * __copy_tofrom_user_power7
 *
 * In:  r3 = address stored to, r4 = address loaded from, r5 = byte count.
 * Out: the non-VMX paths return with r3 = 0 once every byte has been
 *      copied; the VMX paths finish by tail-calling exit_vmx_usercopy.
 *      If a tagged access faults, the fixup code reloads the original
 *      arguments and branches to __copy_tofrom_user_base, so the result
 *      is whatever that routine returns.
 *
 * The arguments are stored at 48/56/64(r1) (no frame has been opened
 * yet) so the fixups can recover them.  Counts below 16 go straight to
 * .Lshort_copy; with CONFIG_ALTIVEC, counts above 4096 go to .Lvmx_copy;
 * everything else falls through to .Lnonvmx_copy.
 */
86 _GLOBAL(__copy_tofrom_user_power7)
87 #ifdef CONFIG_ALTIVEC
88         cmpldi  r5,16
89         cmpldi  cr1,r5,4096
90
91         std     r3,48(r1)
92         std     r4,56(r1)
93         std     r5,64(r1)
94
95         blt     .Lshort_copy
96         bgt     cr1,.Lvmx_copy
97 #else
98         cmpldi  r5,16
99
100         std     r3,48(r1)
101         std     r4,56(r1)
102         std     r5,64(r1)
103
104         blt     .Lshort_copy
105 #endif
106
/*
 * Integer-register copy path.  Reached by fall-through from the entry
 * code and by branch from .Lunwind_stack_nonvmx_copy.
 * r3 = store pointer, r4 = load pointer, r5 = bytes remaining.
 * Faultable accesses are tagged err1, or err2 while the stack frame
 * holding r14-r22 is live.  Falls through into .Lshort_copy with r5
 * reduced to at most 15.
 */
107 .Lnonvmx_copy:
108         /* Get the source 8B aligned */
109         neg     r6,r4
110         mtocrf  0x01,r6         /* cr7 = low 4 bits of -r4 */
111         clrldi  r6,r6,(64-3)    /* r6 = bytes needed to 8B-align r4 */
112
113         bf      cr7*4+3,1f      /* 1-byte step */
114 err1;   lbz     r0,0(r4)
115         addi    r4,r4,1
116 err1;   stb     r0,0(r3)
117         addi    r3,r3,1
118
119 1:      bf      cr7*4+2,2f      /* 2-byte step */
120 err1;   lhz     r0,0(r4)
121         addi    r4,r4,2
122 err1;   sth     r0,0(r3)
123         addi    r3,r3,2
124
125 2:      bf      cr7*4+1,3f      /* 4-byte step */
126 err1;   lwz     r0,0(r4)
127         addi    r4,r4,4
128 err1;   stw     r0,0(r3)
129         addi    r3,r3,4
130
131 3:      sub     r5,r5,r6        /* drop the alignment bytes from the count */
132         cmpldi  r5,128
133         blt     5f              /* less than one 128B block left */
134
            /*
             * Open a stack frame and save r14-r22 plus LR (at 16 bytes
             * above the old stack pointer).  The loop below uses
             * r14-r21 as data registers; the err2 fixup undoes this frame.
             */
135         mflr    r0
136         stdu    r1,-STACKFRAMESIZE(r1)
137         std     r14,STK_REG(R14)(r1)
138         std     r15,STK_REG(R15)(r1)
139         std     r16,STK_REG(R16)(r1)
140         std     r17,STK_REG(R17)(r1)
141         std     r18,STK_REG(R18)(r1)
142         std     r19,STK_REG(R19)(r1)
143         std     r20,STK_REG(R20)(r1)
144         std     r21,STK_REG(R21)(r1)
145         std     r22,STK_REG(R22)(r1)
146         std     r0,STACKFRAMESIZE+16(r1)
147
148         srdi    r6,r5,7         /* ctr = number of 128B blocks */
149         mtctr   r6
150
151         /* Now do cacheline (128B) sized loads and stores. */
152         .align  5
153 4:
154 err2;   ld      r0,0(r4)
155 err2;   ld      r6,8(r4)
156 err2;   ld      r7,16(r4)
157 err2;   ld      r8,24(r4)
158 err2;   ld      r9,32(r4)
159 err2;   ld      r10,40(r4)
160 err2;   ld      r11,48(r4)
161 err2;   ld      r12,56(r4)
162 err2;   ld      r14,64(r4)
163 err2;   ld      r15,72(r4)
164 err2;   ld      r16,80(r4)
165 err2;   ld      r17,88(r4)
166 err2;   ld      r18,96(r4)
167 err2;   ld      r19,104(r4)
168 err2;   ld      r20,112(r4)
169 err2;   ld      r21,120(r4)
170         addi    r4,r4,128
171 err2;   std     r0,0(r3)
172 err2;   std     r6,8(r3)
173 err2;   std     r7,16(r3)
174 err2;   std     r8,24(r3)
175 err2;   std     r9,32(r3)
176 err2;   std     r10,40(r3)
177 err2;   std     r11,48(r3)
178 err2;   std     r12,56(r3)
179 err2;   std     r14,64(r3)
180 err2;   std     r15,72(r3)
181 err2;   std     r16,80(r3)
182 err2;   std     r17,88(r3)
183 err2;   std     r18,96(r3)
184 err2;   std     r19,104(r3)
185 err2;   std     r20,112(r3)
186 err2;   std     r21,120(r3)
187         addi    r3,r3,128
188         bdnz    4b
189
190         clrldi  r5,r5,(64-7)    /* r5 = bytes left after the 128B blocks */
191
192         ld      r14,STK_REG(R14)(r1)
193         ld      r15,STK_REG(R15)(r1)
194         ld      r16,STK_REG(R16)(r1)
195         ld      r17,STK_REG(R17)(r1)
196         ld      r18,STK_REG(R18)(r1)
197         ld      r19,STK_REG(R19)(r1)
198         ld      r20,STK_REG(R20)(r1)
199         ld      r21,STK_REG(R21)(r1)
200         ld      r22,STK_REG(R22)(r1)
201         addi    r1,r1,STACKFRAMESIZE
202
203         /* Up to 127B to go */
204 5:      srdi    r6,r5,4
205         mtocrf  0x01,r6         /* cr7 bits select the 64B/32B/16B steps */
206
207 6:      bf      cr7*4+1,7f      /* 64-byte step */
208 err1;   ld      r0,0(r4)
209 err1;   ld      r6,8(r4)
210 err1;   ld      r7,16(r4)
211 err1;   ld      r8,24(r4)
212 err1;   ld      r9,32(r4)
213 err1;   ld      r10,40(r4)
214 err1;   ld      r11,48(r4)
215 err1;   ld      r12,56(r4)
216         addi    r4,r4,64
217 err1;   std     r0,0(r3)
218 err1;   std     r6,8(r3)
219 err1;   std     r7,16(r3)
220 err1;   std     r8,24(r3)
221 err1;   std     r9,32(r3)
222 err1;   std     r10,40(r3)
223 err1;   std     r11,48(r3)
224 err1;   std     r12,56(r3)
225         addi    r3,r3,64
226
227         /* Up to 63B to go */
228 7:      bf      cr7*4+2,8f      /* 32-byte step */
229 err1;   ld      r0,0(r4)
230 err1;   ld      r6,8(r4)
231 err1;   ld      r7,16(r4)
232 err1;   ld      r8,24(r4)
233         addi    r4,r4,32
234 err1;   std     r0,0(r3)
235 err1;   std     r6,8(r3)
236 err1;   std     r7,16(r3)
237 err1;   std     r8,24(r3)
238         addi    r3,r3,32
239
240         /* Up to 31B to go */
241 8:      bf      cr7*4+3,9f      /* 16-byte step */
242 err1;   ld      r0,0(r4)
243 err1;   ld      r6,8(r4)
244         addi    r4,r4,16
245 err1;   std     r0,0(r3)
246 err1;   std     r6,8(r3)
247         addi    r3,r3,16
248
249 9:      clrldi  r5,r5,(64-4)    /* r5 = final 0-15 bytes */
250
251         /* Up to 15B to go */
/*
 * Copy the last 0-15 bytes and return.  Entered directly from the
 * function entry when the count is below 16, or by fall-through from
 * .Lnonvmx_copy.  The low four bits of r5 select the 8/4/2/1-byte
 * steps.  No stack frame is live here, so faults use err1.
 * Returns with r3 = 0.
 */
252 .Lshort_copy:
253         mtocrf  0x01,r5         /* cr7 = low 4 bits of the count */
254         bf      cr7*4+0,12f     /* 8-byte step, done as two words */
255 err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
256 err1;   lwz     r6,4(r4)
257         addi    r4,r4,8
258 err1;   stw     r0,0(r3)
259 err1;   stw     r6,4(r3)
260         addi    r3,r3,8
261
262 12:     bf      cr7*4+1,13f     /* 4-byte step */
263 err1;   lwz     r0,0(r4)
264         addi    r4,r4,4
265 err1;   stw     r0,0(r3)
266         addi    r3,r3,4
267
268 13:     bf      cr7*4+2,14f     /* 2-byte step */
269 err1;   lhz     r0,0(r4)
270         addi    r4,r4,2
271 err1;   sth     r0,0(r3)
272         addi    r3,r3,2
273
274 14:     bf      cr7*4+3,15f     /* 1-byte step */
275 err1;   lbz     r0,0(r4)
276 err1;   stb     r0,0(r3)
277
278 15:     li      r3,0
279         blr
280
/*
 * Taken from .Lvmx_copy when enter_vmx_usercopy returned 0: drop the
 * stack frame that .Lvmx_copy opened and do the copy with
 * .Lnonvmx_copy instead.
 */
281 .Lunwind_stack_nonvmx_copy:
282         addi    r1,r1,STACKFRAMESIZE
283         b       .Lnonvmx_copy
284
285 #ifdef CONFIG_ALTIVEC
/*
 * VMX copy path, entered from the function entry when the count in r5
 * is above 4096.
 *
 * Saves LR, opens a stack frame and calls enter_vmx_usercopy; r3/r4/r5
 * are then reloaded from the slots written at function entry.  If the
 * call returned 0 the frame is dropped and .Lnonvmx_copy does the copy
 * (after the prefetch streams below have been started).  This code
 * handles the case where r3 and r4 share the same offset within 16
 * bytes; otherwise control goes to .Lvmx_unaligned_copy.
 *
 * Faultable accesses are tagged err3, or err4 inside the cacheline loop
 * where r14-r16 are used as index registers.
 */
286 .Lvmx_copy:
287         mflr    r0
288         std     r0,16(r1)
289         stdu    r1,-STACKFRAMESIZE(r1)
290         bl      .enter_vmx_usercopy
291         cmpwi   cr1,r3,0        /* result tested later via cr1 */
292         ld      r0,STACKFRAMESIZE+16(r1)
293         ld      r3,STACKFRAMESIZE+48(r1)
294         ld      r4,STACKFRAMESIZE+56(r1)
295         ld      r5,STACKFRAMESIZE+64(r1)
296         mtlr    r0
297
298         /*
299          * We prefetch both the source and destination using enhanced touch
300          * instructions. We use a stream ID of 0 for the load side and
301          * 1 for the store side.
302          */
303         clrrdi  r6,r4,7
304         clrrdi  r9,r3,7
305         ori     r9,r9,1         /* stream=1 */
306
307         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
308         cmpldi  r7,0x3FF
309         ble     1f
310         li      r7,0x3FF
311 1:      lis     r0,0x0E00       /* depth=7 */
312         sldi    r7,r7,7
313         or      r7,r7,r0
314         ori     r10,r7,1        /* stream=1 */
315
316         lis     r8,0x8000       /* GO=1 */
317         clrldi  r8,r8,32
318
319 .machine push
320 .machine "power4"
321         /* setup read stream 0 */
322         dcbt    r0,r6,0b01000   /* addr from */
323         dcbt    r0,r7,0b01010   /* length and depth from */
324         /* setup write stream 1 */
325         dcbtst  r0,r9,0b01000   /* addr to */
326         dcbtst  r0,r10,0b01010  /* length and depth to */
327         eieio
328         dcbt    r0,r8,0b01010   /* all streams GO */
329 .machine pop
330
331         beq     cr1,.Lunwind_stack_nonvmx_copy  /* enter_vmx_usercopy returned 0 */
332
333         /*
334          * If source and destination are not relatively aligned we use a
335          * slower permute loop.
336          */
337         xor     r6,r4,r3
338         rldicl. r6,r6,0,(64-4)  /* low 4 bits of (r4 ^ r3), sets cr0 */
339         bne     .Lvmx_unaligned_copy
340
341         /* Get the destination 16B aligned */
342         neg     r6,r3
343         mtocrf  0x01,r6         /* cr7 = low 4 bits of -r3 */
344         clrldi  r6,r6,(64-4)    /* r6 = bytes needed to 16B-align r3 */
345
346         bf      cr7*4+3,1f      /* 1-byte step */
347 err3;   lbz     r0,0(r4)
348         addi    r4,r4,1
349 err3;   stb     r0,0(r3)
350         addi    r3,r3,1
351
352 1:      bf      cr7*4+2,2f      /* 2-byte step */
353 err3;   lhz     r0,0(r4)
354         addi    r4,r4,2
355 err3;   sth     r0,0(r3)
356         addi    r3,r3,2
357
358 2:      bf      cr7*4+1,3f      /* 4-byte step */
359 err3;   lwz     r0,0(r4)
360         addi    r4,r4,4
361 err3;   stw     r0,0(r3)
362         addi    r3,r3,4
363
364 3:      bf      cr7*4+0,4f      /* 8-byte step */
365 err3;   ld      r0,0(r4)
366         addi    r4,r4,8
367 err3;   std     r0,0(r3)
368         addi    r3,r3,8
369
370 4:      sub     r5,r5,r6        /* drop the alignment bytes from the count */
371
372         /* Get the destination 128B aligned */
373         neg     r6,r3
374         srdi    r7,r6,4
375         mtocrf  0x01,r7         /* cr7 bits select the 16B/32B/64B steps */
376         clrldi  r6,r6,(64-7)    /* r6 = bytes needed to 128B-align r3 */
377
            /* r9-r11: index offsets for the lvx/stvx pairs below */
378         li      r9,16
379         li      r10,32
380         li      r11,48
381
382         bf      cr7*4+3,5f      /* 16-byte step */
383 err3;   lvx     vr1,r0,r4
384         addi    r4,r4,16
385 err3;   stvx    vr1,r0,r3
386         addi    r3,r3,16
387
388 5:      bf      cr7*4+2,6f      /* 32-byte step */
389 err3;   lvx     vr1,r0,r4
390 err3;   lvx     vr0,r4,r9
391         addi    r4,r4,32
392 err3;   stvx    vr1,r0,r3
393 err3;   stvx    vr0,r3,r9
394         addi    r3,r3,32
395
396 6:      bf      cr7*4+1,7f      /* 64-byte step */
397 err3;   lvx     vr3,r0,r4
398 err3;   lvx     vr2,r4,r9
399 err3;   lvx     vr1,r4,r10
400 err3;   lvx     vr0,r4,r11
401         addi    r4,r4,64
402 err3;   stvx    vr3,r0,r3
403 err3;   stvx    vr2,r3,r9
404 err3;   stvx    vr1,r3,r10
405 err3;   stvx    vr0,r3,r11
406         addi    r3,r3,64
407
408 7:      sub     r5,r5,r6
409         srdi    r6,r5,7         /* number of full 128B cachelines */
410
            /* r14-r16 become index registers; the err4 fixup reloads them */
411         std     r14,STK_REG(R14)(r1)
412         std     r15,STK_REG(R15)(r1)
413         std     r16,STK_REG(R16)(r1)
414
415         li      r12,64
416         li      r14,80
417         li      r15,96
418         li      r16,112
419
420         mtctr   r6
421
422         /*
423          * Now do cacheline sized loads and stores. By this stage the
424          * cacheline stores are also cacheline aligned.
425          */
426         .align  5
427 8:
428 err4;   lvx     vr7,r0,r4
429 err4;   lvx     vr6,r4,r9
430 err4;   lvx     vr5,r4,r10
431 err4;   lvx     vr4,r4,r11
432 err4;   lvx     vr3,r4,r12
433 err4;   lvx     vr2,r4,r14
434 err4;   lvx     vr1,r4,r15
435 err4;   lvx     vr0,r4,r16
436         addi    r4,r4,128
437 err4;   stvx    vr7,r0,r3
438 err4;   stvx    vr6,r3,r9
439 err4;   stvx    vr5,r3,r10
440 err4;   stvx    vr4,r3,r11
441 err4;   stvx    vr3,r3,r12
442 err4;   stvx    vr2,r3,r14
443 err4;   stvx    vr1,r3,r15
444 err4;   stvx    vr0,r3,r16
445         addi    r3,r3,128
446         bdnz    8b
447
448         ld      r14,STK_REG(R14)(r1)
449         ld      r15,STK_REG(R15)(r1)
450         ld      r16,STK_REG(R16)(r1)
451
452         /* Up to 127B to go */
453         clrldi  r5,r5,(64-7)
454         srdi    r6,r5,4
455         mtocrf  0x01,r6         /* cr7 bits select the 64B/32B/16B steps */
456
457         bf      cr7*4+1,9f      /* 64-byte step */
458 err3;   lvx     vr3,r0,r4
459 err3;   lvx     vr2,r4,r9
460 err3;   lvx     vr1,r4,r10
461 err3;   lvx     vr0,r4,r11
462         addi    r4,r4,64
463 err3;   stvx    vr3,r0,r3
464 err3;   stvx    vr2,r3,r9
465 err3;   stvx    vr1,r3,r10
466 err3;   stvx    vr0,r3,r11
467         addi    r3,r3,64
468
469 9:      bf      cr7*4+2,10f     /* 32-byte step */
470 err3;   lvx     vr1,r0,r4
471 err3;   lvx     vr0,r4,r9
472         addi    r4,r4,32
473 err3;   stvx    vr1,r0,r3
474 err3;   stvx    vr0,r3,r9
475         addi    r3,r3,32
476
477 10:     bf      cr7*4+3,11f     /* 16-byte step */
478 err3;   lvx     vr1,r0,r4
479         addi    r4,r4,16
480 err3;   stvx    vr1,r0,r3
481         addi    r3,r3,16
482
483         /* Up to 15B to go */
484 11:     clrldi  r5,r5,(64-4)
485         mtocrf  0x01,r5         /* cr7 = low 4 bits of the count */
486         bf      cr7*4+0,12f     /* 8-byte step */
487 err3;   ld      r0,0(r4)
488         addi    r4,r4,8
489 err3;   std     r0,0(r3)
490         addi    r3,r3,8
491
492 12:     bf      cr7*4+1,13f     /* 4-byte step */
493 err3;   lwz     r0,0(r4)
494         addi    r4,r4,4
495 err3;   stw     r0,0(r3)
496         addi    r3,r3,4
497
498 13:     bf      cr7*4+2,14f     /* 2-byte step */
499 err3;   lhz     r0,0(r4)
500         addi    r4,r4,2
501 err3;   sth     r0,0(r3)
502         addi    r3,r3,2
503
504 14:     bf      cr7*4+3,15f     /* 1-byte step */
505 err3;   lbz     r0,0(r4)
506 err3;   stb     r0,0(r3)
507
            /*
             * Pop the frame and leave through exit_vmx_usercopy.  LR was
             * restored right after enter_vmx_usercopy, so that routine
             * returns to our caller; r3 is whatever it leaves there
             * (presumably 0 -- confirm against exit_vmx_usercopy).
             */
508 15:     addi    r1,r1,STACKFRAMESIZE
509         b       .exit_vmx_usercopy      /* tail call optimise */
510
/*
 * VMX copy for the case where r3 (store pointer) and r4 (load pointer)
 * do not share the same offset within 16 bytes.  Entered from
 * .Lvmx_copy with the stack frame open and LR already restored.
 *
 * The destination is aligned as in the aligned path.  vr16 holds a
 * permute control vector derived from r4 by lvsl; each stored quadword
 * is built by vperm from the previously loaded quadword and the next
 * one.  vr0 always carries the most recently loaded quadword from one
 * step to the next, and r4 runs 16 bytes ahead until it is corrected
 * at label 11.
 *
 * Faultable accesses are tagged err3, or err4 inside the cacheline loop
 * where r14-r16 are used as index registers.
 */
511 .Lvmx_unaligned_copy:
512         /* Get the destination 16B aligned */
513         neg     r6,r3
514         mtocrf  0x01,r6         /* cr7 = low 4 bits of -r3 */
515         clrldi  r6,r6,(64-4)    /* r6 = bytes needed to 16B-align r3 */
516
517         bf      cr7*4+3,1f      /* 1-byte step */
518 err3;   lbz     r0,0(r4)
519         addi    r4,r4,1
520 err3;   stb     r0,0(r3)
521         addi    r3,r3,1
522
523 1:      bf      cr7*4+2,2f      /* 2-byte step */
524 err3;   lhz     r0,0(r4)
525         addi    r4,r4,2
526 err3;   sth     r0,0(r3)
527         addi    r3,r3,2
528
529 2:      bf      cr7*4+1,3f      /* 4-byte step */
530 err3;   lwz     r0,0(r4)
531         addi    r4,r4,4
532 err3;   stw     r0,0(r3)
533         addi    r3,r3,4
534
535 3:      bf      cr7*4+0,4f      /* 8-byte step, done as two words */
536 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
537 err3;   lwz     r7,4(r4)
538         addi    r4,r4,8
539 err3;   stw     r0,0(r3)
540 err3;   stw     r7,4(r3)
541         addi    r3,r3,8
542
543 4:      sub     r5,r5,r6        /* drop the alignment bytes from the count */
544
545         /* Get the destination 128B aligned */
546         neg     r6,r3
547         srdi    r7,r6,4
548         mtocrf  0x01,r7         /* cr7 bits select the 16B/32B/64B steps */
549         clrldi  r6,r6,(64-7)    /* r6 = bytes needed to 128B-align r3 */
550
            /* r9-r11: index offsets for the lvx/stvx pairs below */
551         li      r9,16
552         li      r10,32
553         li      r11,48
554
555         lvsl    vr16,0,r4       /* Setup permute control vector */
556 err3;   lvx     vr0,0,r4        /* prime vr0 with the first quadword */
557         addi    r4,r4,16        /* r4 now runs 16 bytes ahead */
558
559         bf      cr7*4+3,5f      /* 16-byte step */
560 err3;   lvx     vr1,r0,r4
561         vperm   vr8,vr0,vr1,vr16
562         addi    r4,r4,16
563 err3;   stvx    vr8,r0,r3
564         addi    r3,r3,16
565         vor     vr0,vr1,vr1     /* carry the last load forward in vr0 */
566
567 5:      bf      cr7*4+2,6f      /* 32-byte step */
568 err3;   lvx     vr1,r0,r4
569         vperm   vr8,vr0,vr1,vr16
570 err3;   lvx     vr0,r4,r9
571         vperm   vr9,vr1,vr0,vr16
572         addi    r4,r4,32
573 err3;   stvx    vr8,r0,r3
574 err3;   stvx    vr9,r3,r9
575         addi    r3,r3,32
576
577 6:      bf      cr7*4+1,7f      /* 64-byte step */
578 err3;   lvx     vr3,r0,r4
579         vperm   vr8,vr0,vr3,vr16
580 err3;   lvx     vr2,r4,r9
581         vperm   vr9,vr3,vr2,vr16
582 err3;   lvx     vr1,r4,r10
583         vperm   vr10,vr2,vr1,vr16
584 err3;   lvx     vr0,r4,r11
585         vperm   vr11,vr1,vr0,vr16
586         addi    r4,r4,64
587 err3;   stvx    vr8,r0,r3
588 err3;   stvx    vr9,r3,r9
589 err3;   stvx    vr10,r3,r10
590 err3;   stvx    vr11,r3,r11
591         addi    r3,r3,64
592
593 7:      sub     r5,r5,r6
594         srdi    r6,r5,7         /* number of full 128B cachelines */
595
            /* r14-r16 become index registers; the err4 fixup reloads them */
596         std     r14,STK_REG(R14)(r1)
597         std     r15,STK_REG(R15)(r1)
598         std     r16,STK_REG(R16)(r1)
599
600         li      r12,64
601         li      r14,80
602         li      r15,96
603         li      r16,112
604
605         mtctr   r6
606
607         /*
608          * Now do cacheline sized loads and stores. By this stage the
609          * cacheline stores are also cacheline aligned.
610          */
611         .align  5
612 8:
613 err4;   lvx     vr7,r0,r4
614         vperm   vr8,vr0,vr7,vr16
615 err4;   lvx     vr6,r4,r9
616         vperm   vr9,vr7,vr6,vr16
617 err4;   lvx     vr5,r4,r10
618         vperm   vr10,vr6,vr5,vr16
619 err4;   lvx     vr4,r4,r11
620         vperm   vr11,vr5,vr4,vr16
621 err4;   lvx     vr3,r4,r12
622         vperm   vr12,vr4,vr3,vr16
623 err4;   lvx     vr2,r4,r14
624         vperm   vr13,vr3,vr2,vr16
625 err4;   lvx     vr1,r4,r15
626         vperm   vr14,vr2,vr1,vr16
627 err4;   lvx     vr0,r4,r16
628         vperm   vr15,vr1,vr0,vr16
629         addi    r4,r4,128
630 err4;   stvx    vr8,r0,r3
631 err4;   stvx    vr9,r3,r9
632 err4;   stvx    vr10,r3,r10
633 err4;   stvx    vr11,r3,r11
634 err4;   stvx    vr12,r3,r12
635 err4;   stvx    vr13,r3,r14
636 err4;   stvx    vr14,r3,r15
637 err4;   stvx    vr15,r3,r16
638         addi    r3,r3,128
639         bdnz    8b
640
641         ld      r14,STK_REG(R14)(r1)
642         ld      r15,STK_REG(R15)(r1)
643         ld      r16,STK_REG(R16)(r1)
644
645         /* Up to 127B to go */
646         clrldi  r5,r5,(64-7)
647         srdi    r6,r5,4
648         mtocrf  0x01,r6         /* cr7 bits select the 64B/32B/16B steps */
649
650         bf      cr7*4+1,9f      /* 64-byte step */
651 err3;   lvx     vr3,r0,r4
652         vperm   vr8,vr0,vr3,vr16
653 err3;   lvx     vr2,r4,r9
654         vperm   vr9,vr3,vr2,vr16
655 err3;   lvx     vr1,r4,r10
656         vperm   vr10,vr2,vr1,vr16
657 err3;   lvx     vr0,r4,r11
658         vperm   vr11,vr1,vr0,vr16
659         addi    r4,r4,64
660 err3;   stvx    vr8,r0,r3
661 err3;   stvx    vr9,r3,r9
662 err3;   stvx    vr10,r3,r10
663 err3;   stvx    vr11,r3,r11
664         addi    r3,r3,64
665
666 9:      bf      cr7*4+2,10f     /* 32-byte step */
667 err3;   lvx     vr1,r0,r4
668         vperm   vr8,vr0,vr1,vr16
669 err3;   lvx     vr0,r4,r9
670         vperm   vr9,vr1,vr0,vr16
671         addi    r4,r4,32
672 err3;   stvx    vr8,r0,r3
673 err3;   stvx    vr9,r3,r9
674         addi    r3,r3,32
675
676 10:     bf      cr7*4+3,11f     /* 16-byte step */
677 err3;   lvx     vr1,r0,r4
678         vperm   vr8,vr0,vr1,vr16
679         addi    r4,r4,16
680 err3;   stvx    vr8,r0,r3
681         addi    r3,r3,16
682
683         /* Up to 15B to go */
684 11:     clrldi  r5,r5,(64-4)
685         addi    r4,r4,-16       /* Unwind the +16 load offset */
686         mtocrf  0x01,r5         /* cr7 = low 4 bits of the count */
687         bf      cr7*4+0,12f     /* 8-byte step, done as two words */
688 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
689 err3;   lwz     r6,4(r4)
690         addi    r4,r4,8
691 err3;   stw     r0,0(r3)
692 err3;   stw     r6,4(r3)
693         addi    r3,r3,8
694
695 12:     bf      cr7*4+1,13f     /* 4-byte step */
696 err3;   lwz     r0,0(r4)
697         addi    r4,r4,4
698 err3;   stw     r0,0(r3)
699         addi    r3,r3,4
700
701 13:     bf      cr7*4+2,14f     /* 2-byte step */
702 err3;   lhz     r0,0(r4)
703         addi    r4,r4,2
704 err3;   sth     r0,0(r3)
705         addi    r3,r3,2
706
707 14:     bf      cr7*4+3,15f     /* 1-byte step */
708 err3;   lbz     r0,0(r4)
709 err3;   stb     r0,0(r3)
710
            /*
             * Pop the frame and leave through exit_vmx_usercopy, which
             * returns to our caller; r3 is whatever it leaves there
             * (presumably 0 -- confirm against exit_vmx_usercopy).
             */
711 15:     addi    r1,r1,STACKFRAMESIZE
712         b       .exit_vmx_usercopy      /* tail call optimise */
713 #endif /* CONFIG_ALTIVEC */