1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration for this kernel. */
/* NOTE(review): the #else / #endif / #ifdef lines that pair with the */
/* conditionals below are not visible in this view, so which of the */
/* alternative definitions is active for a given target cannot be */
/* determined from here. */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A is multiplied by SIZE and loaded into PREA during setup. */
/* Several alternative values are listed; presumably each one sits under */
/* its own CPU-model conditional -- confirm against the full file. */
145 #define PREFETCHSIZE_A 24
148 #if defined(PPC440) || defined(PPC440FP2)
149 #define PREFETCHSIZE_A 24
153 #define PREFETCHSIZE_A 32
157 #define PREFETCHSIZE_A 72
161 #define PREFETCHSIZE_A 16
165 #define PREFETCHSIZE_A 96
169 #define PREFETCHSIZE_A 112
172 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
/* NOP1 / NOP2 move a register onto itself, leaving its value unchanged. */
176 #define NOP1 mr LDA, LDA
177 #define NOP2 mr INCX, INCX
/* Stack frame: total size, then the SP-relative slots named ALPHA_R, */
/* ALPHA_I and FZERO.  Two layouts are listed; presumably one per ABI */
/* word size -- TODO confirm which conditional selects each. */
183 #define STACKSIZE 224
184 #define ALPHA_R 200(SP)
185 #define ALPHA_I 208(SP)
186 #define FZERO 216(SP)
188 #define STACKSIZE 280
189 #define ALPHA_R 256(SP)
190 #define ALPHA_I 264(SP)
191 #define FZERO 272(SP)
/* FMADD1 and FMADD2 are both mapped to FNMSUB in the lines visible here. */
/* NOTE(review): any alternative mappings are outside this view. */
195 #define FMADD1 FNMSUB
199 #define FMADD2 FNMSUB
/* Frame setup and argument fetch. */
/* Reserve STACKSIZE bytes by moving SP down. */
205 addi SP, SP, -STACKSIZE
/* The remaining arguments are read from FRAMESLOT(n) at offsets */
/* STACKSIZE past the new SP.  lwz variants load 32-bit words, ld variants */
/* load 64-bit doublewords.  The conditionals that choose among these */
/* alternative load sequences are only partly visible in this view. */
264 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
266 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
267 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
268 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
272 #if defined(_AIX) || defined(__APPLE__)
275 lwz X, FRAMESLOT(0) + STACKSIZE(SP)
276 lwz INCX, FRAMESLOT(1) + STACKSIZE(SP)
277 lwz Y, FRAMESLOT(2) + STACKSIZE(SP)
278 lwz INCY, FRAMESLOT(3) + STACKSIZE(SP)
279 lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP)
281 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
282 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
283 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
286 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
287 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
288 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
/* Save alpha_r / alpha_i into the ALPHA_R / ALPHA_I stack slots. */
292 STFD alpha_r, ALPHA_R
293 STFD alpha_i, ALPHA_I
/* Shift LDA, INCX and INCY left by ZBASE_SHIFT bits. */
/* presumably this converts element counts into byte strides -- confirm */
/* against the definition of ZBASE_SHIFT. */
295 slwi LDA, LDA, ZBASE_SHIFT
296 slwi INCX, INCX, ZBASE_SHIFT
297 slwi INCY, INCY, ZBASE_SHIFT
/* PREA = PREFETCHSIZE_A * SIZE. */
299 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the shifted INCX with 2 * SIZE; the branch that consumes cr0 */
/* is not visible in this view. */
304 cmpwi cr0, INCX, 2 * SIZE
/* Store a1..a8 to eight consecutive SIZE-wide slots of BUFFER, then */
/* advance BUFFER by 8 * SIZE.  The loads that fill a1..a8 are not */
/* visible in this view. */
332 STFD a1, 0 * SIZE(BUFFER)
333 STFD a2, 1 * SIZE(BUFFER)
334 STFD a3, 2 * SIZE(BUFFER)
335 STFD a4, 3 * SIZE(BUFFER)
336 STFD a5, 4 * SIZE(BUFFER)
337 STFD a6, 5 * SIZE(BUFFER)
338 STFD a7, 6 * SIZE(BUFFER)
339 STFD a8, 7 * SIZE(BUFFER)
341 addi BUFFER, BUFFER, 8 * SIZE
/* Two-slot variant of the same copy: store a1, a2 and advance BUFFER by */
/* 2 * SIZE. */
356 STFD a1, 0 * SIZE(BUFFER)
357 STFD a2, 1 * SIZE(BUFFER)
359 addi BUFFER, BUFFER, 2 * SIZE
/* Compare the shifted INCY with 2 * SIZE; the branch that consumes cr0 */
/* is not visible in this view. */
367 cmpwi cr0, INCY, 2 * SIZE
/* Write f0 to eight consecutive SIZE-wide slots of BUFFER and advance */
/* BUFFER by 8 * SIZE. */
/* NOTE(review): f0 presumably holds 0.0 loaded from FZERO outside this */
/* view -- confirm. */
378 STFD f0, 0 * SIZE(BUFFER)
379 STFD f0, 1 * SIZE(BUFFER)
380 STFD f0, 2 * SIZE(BUFFER)
381 STFD f0, 3 * SIZE(BUFFER)
382 STFD f0, 4 * SIZE(BUFFER)
383 STFD f0, 5 * SIZE(BUFFER)
384 STFD f0, 6 * SIZE(BUFFER)
385 STFD f0, 7 * SIZE(BUFFER)
386 addi BUFFER, BUFFER, 8 * SIZE
/* Set-up for a pass that works on two column pointers (AO1, AO2). */
/* Comments below assume FMADD d,a,c,b = a*c + b and */
/* FNMSUB d,a,c,b = -(a*c - b), i.e. the PowerPC fmadd / fnmsub forms */
/* -- confirm against the macro definitions. */
/* TEMP = IS shifted left by ZBASE_SHIFT. */
399 slwi TEMP, IS, ZBASE_SHIFT
/* Load four values from XX into atemp1..atemp4. */
411 LFD atemp1, 0 * SIZE(XX)
412 LFD atemp2, 1 * SIZE(XX)
413 LFD atemp3, 2 * SIZE(XX)
414 LFD atemp4, 3 * SIZE(XX)
/* Load the first four values at AO1 and the values at offsets 2, 3 of AO2. */
416 LFD a1, 0 * SIZE(AO1)
417 LFD a2, 1 * SIZE(AO1)
418 LFD a3, 2 * SIZE(AO1)
419 LFD a4, 3 * SIZE(AO1)
421 LFD a7, 2 * SIZE(AO2)
422 LFD a8, 3 * SIZE(AO2)
/* Initialise xsum1..xsum4 from products of atemp* with a*, while */
/* advancing AO1, AO2, XX and YY by 4 * SIZE and preloading the next */
/* a1, a2, a3, a4, a5..a8 values. */
424 FMUL xsum1, atemp1, a1
425 addi AO2, AO2, 4 * SIZE
426 FMUL xsum2, atemp2, a1
427 LFD a1, 4 * SIZE(AO1)
428 FMUL xsum3, atemp1, a3
429 addi AO1, AO1, 4 * SIZE
430 FMUL xsum4, atemp2, a3
431 LFD a5, 0 * SIZE(AO2)
434 FNMSUB xsum1, atemp2, a2, xsum1
436 addi XX, XX, 4 * SIZE
438 FMADD xsum2, atemp1, a2, xsum2
440 LFD a2, 1 * SIZE(AO1)
441 FNMSUB xsum3, atemp2, a4, xsum3
442 addi YY, YY, 4 * SIZE
443 FMADD xsum4, atemp1, a4, xsum4
444 LFD a6, 1 * SIZE(AO2)
446 FMADD xsum1, atemp3, a3, xsum1
448 FMADD xsum2, atemp4, a3, xsum2
449 LFD a3, 2 * SIZE(AO1)
450 FMADD xsum3, atemp3, a7, xsum3
452 FMADD xsum4, atemp4, a7, xsum4
453 LFD a7, 2 * SIZE(AO2)
455 FMADD1 xsum1, atemp4, a4, xsum1
457 FMADD2 xsum2, atemp3, a4, xsum2
458 LFD a4, 3 * SIZE(AO1)
460 FMADD1 xsum3, atemp4, a8, xsum3
464 FMADD2 xsum4, atemp3, a8, xsum4
466 LFD a8, 3 * SIZE(AO2)
/* Replace (atemp1, atemp2) with y05*atemp1 - y06*atemp2 and */
/* y05*atemp2 + y06*atemp1, and likewise (atemp3, atemp4). */
/* NOTE(review): y05 / y06 presumably hold alpha_r / alpha_i reloaded */
/* outside this view -- confirm. */
/* The first y01..y04 are loaded from YY in between. */
468 FMUL xtemp1, y05, atemp1
469 LFD y01, 0 * SIZE(YY)
470 FMUL xtemp2, y06, atemp1
471 LFD y02, 1 * SIZE(YY)
472 FMUL xtemp3, y05, atemp3
473 LFD y03, 2 * SIZE(YY)
474 FMUL xtemp4, y06, atemp3
475 LFD y04, 3 * SIZE(YY)
/* xtemp1..xtemp4 are reused right after being consumed: they are */
/* reloaded with the first four values at XX. */
477 FNMSUB atemp1, y06, atemp2, xtemp1
478 LFD xtemp1, 0 * SIZE(XX)
479 FMADD atemp2, y05, atemp2, xtemp2
480 LFD xtemp2, 1 * SIZE(XX)
481 FNMSUB atemp3, y06, atemp4, xtemp3
482 LFD xtemp3, 2 * SIZE(XX)
483 FMADD atemp4, y05, atemp4, xtemp4
484 LFD xtemp4, 3 * SIZE(XX)
/* Unrolled, interleaved group covering 16 * SIZE of AO1, AO2, XX and YY. */
/* Each step pairs two accumulations: */
/*   xsumN += xtemp* * a*   (operands from XX and AO1/AO2) */
/*   yNN   += atemp* * a*   (updates values loaded from YY) */
/* Loads of the next operands and stores of finished y values are slotted */
/* between the arithmetic, so instruction order matters. */
/* NOTE(review): labels, branches and any prefetch instructions around */
/* this group are not visible in this view. */
/* -- YY offsets 0..3 (y01..y04) -- */
489 FMADD xsum1, xtemp1, a1, xsum1
491 FMADD y01, atemp1, a1, y01
494 FMADD xsum2, xtemp2, a1, xsum2
496 FMADD y02, atemp2, a1, y02
497 LFD a1, 4 * SIZE(AO1)
499 FMADD xsum3, xtemp1, a5, xsum3
501 FMADD y03, atemp1, a3, y03
504 FMADD xsum4, xtemp2, a5, xsum4
506 FMADD y04, atemp2, a3, y04
509 FMADD1 xsum1, xtemp2, a2, xsum1
510 LFD y05, 4 * SIZE(YY)
511 FNMSUB y01, atemp2, a2, y01
514 FMADD2 xsum2, xtemp1, a2, xsum2
515 LFD y06, 5 * SIZE(YY)
516 FMADD y02, atemp1, a2, y02
517 LFD a2, 5 * SIZE(AO1)
519 FMADD1 xsum3, xtemp2, a6, xsum3
520 LFD xtemp2, 5 * SIZE(XX)
521 FNMSUB y03, atemp2, a4, y03
524 FMADD2 xsum4, xtemp1, a6, xsum4
525 LFD xtemp1, 4 * SIZE(XX)
526 FMADD y04, atemp1, a4, y04
529 FMADD xsum1, xtemp3, a3, xsum1
530 LFD y07, 6 * SIZE(YY)
531 FMADD y01, atemp3, a5, y01
534 FMADD xsum2, xtemp4, a3, xsum2
535 LFD a3, 6 * SIZE(AO1)
536 FMADD y02, atemp4, a5, y02
537 LFD a5, 4 * SIZE(AO2)
539 FMADD xsum3, xtemp3, a7, xsum3
540 LFD y08, 7 * SIZE(YY)
541 FMADD y03, atemp3, a7, y03
544 FMADD xsum4, xtemp4, a7, xsum4
546 FMADD y04, atemp4, a7, y04
547 LFD a7, 6 * SIZE(AO2)
549 FMADD1 xsum1, xtemp4, a4, xsum1
551 FNMSUB y01, atemp4, a6, y01
555 FMADD2 xsum2, xtemp3, a4, xsum2
556 LFD a4, 7 * SIZE(AO1)
557 FMADD y02, atemp3, a6, y02
558 LFD a6, 5 * SIZE(AO2)
560 FMADD1 xsum3, xtemp4, a8, xsum3
561 LFD xtemp4, 7 * SIZE(XX)
562 FNMSUB y03, atemp4, a8, y03
565 FMADD2 xsum4, xtemp3, a8, xsum4
566 LFD xtemp3, 6 * SIZE(XX)
567 FMADD y04, atemp3, a8, y04
568 LFD a8, 7 * SIZE(AO2)
/* -- YY offsets 4..7 (y05..y08); y01..y04 are stored to offsets 0..3 -- */
570 FMADD xsum1, xtemp1, a1, xsum1
571 STFD y01, 0 * SIZE(YY)
572 FMADD y05, atemp1, a1, y05
575 FMADD xsum2, xtemp2, a1, xsum2
576 STFD y02, 1 * SIZE(YY)
577 FMADD y06, atemp2, a1, y06
578 LFD a1, 8 * SIZE(AO1)
580 FMADD xsum3, xtemp1, a5, xsum3
581 STFD y03, 2 * SIZE(YY)
582 FMADD y07, atemp1, a3, y07
585 FMADD xsum4, xtemp2, a5, xsum4
586 STFD y04, 3 * SIZE(YY)
587 FMADD y08, atemp2, a3, y08
590 FMADD1 xsum1, xtemp2, a2, xsum1
591 LFD y01, 8 * SIZE(YY)
592 FNMSUB y05, atemp2, a2, y05
595 FMADD2 xsum2, xtemp1, a2, xsum2
596 LFD y02, 9 * SIZE(YY)
597 FMADD y06, atemp1, a2, y06
598 LFD a2, 9 * SIZE(AO1)
600 FMADD1 xsum3, xtemp2, a6, xsum3
601 LFD xtemp2, 9 * SIZE(XX)
602 FNMSUB y07, atemp2, a4, y07
605 FMADD2 xsum4, xtemp1, a6, xsum4
606 LFD xtemp1, 8 * SIZE(XX)
607 FMADD y08, atemp1, a4, y08
610 FMADD xsum1, xtemp3, a3, xsum1
611 LFD y03, 10 * SIZE(YY)
612 FMADD y05, atemp3, a5, y05
615 FMADD xsum2, xtemp4, a3, xsum2
616 LFD a3, 10 * SIZE(AO1)
617 FMADD y06, atemp4, a5, y06
618 LFD a5, 8 * SIZE(AO2)
620 FMADD xsum3, xtemp3, a7, xsum3
621 LFD y04, 11 * SIZE(YY)
622 FMADD y07, atemp3, a7, y07
625 FMADD xsum4, xtemp4, a7, xsum4
627 FMADD y08, atemp4, a7, y08
628 LFD a7, 10 * SIZE(AO2)
630 FMADD1 xsum1, xtemp4, a4, xsum1
632 FNMSUB y05, atemp4, a6, y05
635 FMADD2 xsum2, xtemp3, a4, xsum2
636 LFD a4, 11 * SIZE(AO1)
637 FMADD y06, atemp3, a6, y06
638 LFD a6, 9 * SIZE(AO2)
640 FMADD1 xsum3, xtemp4, a8, xsum3
641 LFD xtemp4, 11 * SIZE(XX)
642 FNMSUB y07, atemp4, a8, y07
647 FMADD2 xsum4, xtemp3, a8, xsum4
648 LFD xtemp3, 10 * SIZE(XX)
649 FMADD y08, atemp3, a8, y08
650 LFD a8, 11 * SIZE(AO2)
/* -- YY offsets 8..11 (y01..y04); y05..y08 are stored to offsets 4..7 -- */
652 FMADD xsum1, xtemp1, a1, xsum1
653 STFD y05, 4 * SIZE(YY)
654 FMADD y01, atemp1, a1, y01
657 FMADD xsum2, xtemp2, a1, xsum2
658 STFD y06, 5 * SIZE(YY)
659 FMADD y02, atemp2, a1, y02
660 LFD a1, 12 * SIZE(AO1)
662 FMADD xsum3, xtemp1, a5, xsum3
663 STFD y07, 6 * SIZE(YY)
664 FMADD y03, atemp1, a3, y03
667 FMADD xsum4, xtemp2, a5, xsum4
668 STFD y08, 7 * SIZE(YY)
669 FMADD y04, atemp2, a3, y04
672 FMADD1 xsum1, xtemp2, a2, xsum1
673 LFD y05, 12 * SIZE(YY)
674 FNMSUB y01, atemp2, a2, y01
677 FMADD2 xsum2, xtemp1, a2, xsum2
678 LFD y06, 13 * SIZE(YY)
679 FMADD y02, atemp1, a2, y02
680 LFD a2, 13 * SIZE(AO1)
682 FMADD1 xsum3, xtemp2, a6, xsum3
683 LFD xtemp2, 13 * SIZE(XX)
684 FNMSUB y03, atemp2, a4, y03
687 FMADD2 xsum4, xtemp1, a6, xsum4
688 LFD xtemp1, 12 * SIZE(XX)
689 FMADD y04, atemp1, a4, y04
692 FMADD xsum1, xtemp3, a3, xsum1
693 LFD y07, 14 * SIZE(YY)
694 FMADD y01, atemp3, a5, y01
697 FMADD xsum2, xtemp4, a3, xsum2
698 LFD a3, 14 * SIZE(AO1)
699 FMADD y02, atemp4, a5, y02
700 LFD a5, 12 * SIZE(AO2)
702 FMADD xsum3, xtemp3, a7, xsum3
703 LFD y08, 15 * SIZE(YY)
704 FMADD y03, atemp3, a7, y03
707 FMADD xsum4, xtemp4, a7, xsum4
709 FMADD y04, atemp4, a7, y04
710 LFD a7, 14 * SIZE(AO2)
712 FMADD1 xsum1, xtemp4, a4, xsum1
714 FNMSUB y01, atemp4, a6, y01
718 FMADD2 xsum2, xtemp3, a4, xsum2
719 LFD a4, 15 * SIZE(AO1)
720 FMADD y02, atemp3, a6, y02
721 LFD a6, 13 * SIZE(AO2)
723 FMADD1 xsum3, xtemp4, a8, xsum3
724 LFD xtemp4, 15 * SIZE(XX)
725 FNMSUB y03, atemp4, a8, y03
728 FMADD2 xsum4, xtemp3, a8, xsum4
729 LFD xtemp3, 14 * SIZE(XX)
730 FMADD y04, atemp3, a8, y04
731 LFD a8, 15 * SIZE(AO2)
/* -- YY offsets 12..15 (y05..y08); y01..y04 are stored to offsets 8..11. */
/* AO2, XX, AO1 and YY are each advanced by 16 * SIZE inside this step, */
/* so loads placed after a bump use the small offsets of the next group. -- */
733 FMADD xsum1, xtemp1, a1, xsum1
734 STFD y01, 8 * SIZE(YY)
735 FMADD y05, atemp1, a1, y05
738 FMADD xsum2, xtemp2, a1, xsum2
739 STFD y02, 9 * SIZE(YY)
740 FMADD y06, atemp2, a1, y06
741 LFD a1, 16 * SIZE(AO1)
743 FMADD xsum3, xtemp1, a5, xsum3
744 STFD y03, 10 * SIZE(YY)
745 FMADD y07, atemp1, a3, y07
748 FMADD xsum4, xtemp2, a5, xsum4
749 STFD y04, 11 * SIZE(YY)
750 FMADD y08, atemp2, a3, y08
753 FMADD1 xsum1, xtemp2, a2, xsum1
754 LFD y01, 16 * SIZE(YY)
755 FNMSUB y05, atemp2, a2, y05
758 FMADD2 xsum2, xtemp1, a2, xsum2
759 LFD y02, 17 * SIZE(YY)
760 FMADD y06, atemp1, a2, y06
761 LFD a2, 17 * SIZE(AO1)
763 FMADD1 xsum3, xtemp2, a6, xsum3
764 LFD xtemp2, 17 * SIZE(XX)
765 FNMSUB y07, atemp2, a4, y07
768 FMADD2 xsum4, xtemp1, a6, xsum4
769 LFD xtemp1, 16 * SIZE(XX)
770 FMADD y08, atemp1, a4, y08
771 addi AO2, AO2, 16 * SIZE
773 FMADD xsum1, xtemp3, a3, xsum1
774 LFD y03, 18 * SIZE(YY)
775 FMADD y05, atemp3, a5, y05
776 addi XX, XX, 16 * SIZE
778 FMADD xsum2, xtemp4, a3, xsum2
779 LFD a3, 18 * SIZE(AO1)
780 FMADD y06, atemp4, a5, y06
781 LFD a5, 0 * SIZE(AO2)
783 FMADD xsum3, xtemp3, a7, xsum3
784 LFD y04, 19 * SIZE(YY)
785 FMADD y07, atemp3, a7, y07
788 FMADD xsum4, xtemp4, a7, xsum4
789 addi AO1, AO1, 16 * SIZE
790 FMADD y08, atemp4, a7, y08
791 LFD a7, 2 * SIZE(AO2)
793 FMADD1 xsum1, xtemp4, a4, xsum1
794 addi YY, YY, 16 * SIZE
795 FNMSUB y05, atemp4, a6, y05
798 FMADD2 xsum2, xtemp3, a4, xsum2
799 LFD a4, 3 * SIZE(AO1)
800 FMADD y06, atemp3, a6, y06
801 LFD a6, 1 * SIZE(AO2)
803 FMADD1 xsum3, xtemp4, a8, xsum3
804 LFD xtemp4, 3 * SIZE(XX)
805 FNMSUB y07, atemp4, a8, y07
808 FMADD2 xsum4, xtemp3, a8, xsum4
809 LFD xtemp3, 2 * SIZE(XX)
810 FMADD y08, atemp3, a8, y08
811 LFD a8, 3 * SIZE(AO2)
/* Second unrolled group covering 16 * SIZE of AO1, AO2, XX and YY. */
/* It follows the same instruction pattern as the group before it, except */
/* that its first step also stores y05..y08 to offsets -4..-1 of YY. */
/* Those offsets are negative because YY was already advanced by */
/* 16 * SIZE when the values were finished. */
/* NOTE(review): labels and branches are not visible in this view, so */
/* whether this is the repeated loop body is an inference -- confirm. */
/* -- YY offsets 0..3 (y01..y04); previous y05..y08 are stored -- */
813 FMADD xsum1, xtemp1, a1, xsum1
814 STFD y05, -4 * SIZE(YY)
815 FMADD y01, atemp1, a1, y01
818 FMADD xsum2, xtemp2, a1, xsum2
819 STFD y06, -3 * SIZE(YY)
820 FMADD y02, atemp2, a1, y02
821 LFD a1, 4 * SIZE(AO1)
823 FMADD xsum3, xtemp1, a5, xsum3
824 STFD y07, -2 * SIZE(YY)
825 FMADD y03, atemp1, a3, y03
828 FMADD xsum4, xtemp2, a5, xsum4
829 STFD y08, -1 * SIZE(YY)
830 FMADD y04, atemp2, a3, y04
833 FMADD1 xsum1, xtemp2, a2, xsum1
834 LFD y05, 4 * SIZE(YY)
835 FNMSUB y01, atemp2, a2, y01
838 FMADD2 xsum2, xtemp1, a2, xsum2
839 LFD y06, 5 * SIZE(YY)
840 FMADD y02, atemp1, a2, y02
841 LFD a2, 5 * SIZE(AO1)
843 FMADD1 xsum3, xtemp2, a6, xsum3
844 LFD xtemp2, 5 * SIZE(XX)
845 FNMSUB y03, atemp2, a4, y03
848 FMADD2 xsum4, xtemp1, a6, xsum4
849 LFD xtemp1, 4 * SIZE(XX)
850 FMADD y04, atemp1, a4, y04
853 FMADD xsum1, xtemp3, a3, xsum1
854 LFD y07, 6 * SIZE(YY)
855 FMADD y01, atemp3, a5, y01
858 FMADD xsum2, xtemp4, a3, xsum2
859 LFD a3, 6 * SIZE(AO1)
860 FMADD y02, atemp4, a5, y02
861 LFD a5, 4 * SIZE(AO2)
863 FMADD xsum3, xtemp3, a7, xsum3
864 LFD y08, 7 * SIZE(YY)
865 FMADD y03, atemp3, a7, y03
868 FMADD xsum4, xtemp4, a7, xsum4
870 FMADD y04, atemp4, a7, y04
871 LFD a7, 6 * SIZE(AO2)
873 FMADD1 xsum1, xtemp4, a4, xsum1
875 FNMSUB y01, atemp4, a6, y01
879 FMADD2 xsum2, xtemp3, a4, xsum2
880 LFD a4, 7 * SIZE(AO1)
881 FMADD y02, atemp3, a6, y02
882 LFD a6, 5 * SIZE(AO2)
884 FMADD1 xsum3, xtemp4, a8, xsum3
885 LFD xtemp4, 7 * SIZE(XX)
886 FNMSUB y03, atemp4, a8, y03
889 FMADD2 xsum4, xtemp3, a8, xsum4
890 LFD xtemp3, 6 * SIZE(XX)
891 FMADD y04, atemp3, a8, y04
892 LFD a8, 7 * SIZE(AO2)
/* -- YY offsets 4..7 (y05..y08); y01..y04 are stored to offsets 0..3 -- */
894 FMADD xsum1, xtemp1, a1, xsum1
895 STFD y01, 0 * SIZE(YY)
896 FMADD y05, atemp1, a1, y05
899 FMADD xsum2, xtemp2, a1, xsum2
900 STFD y02, 1 * SIZE(YY)
901 FMADD y06, atemp2, a1, y06
902 LFD a1, 8 * SIZE(AO1)
904 FMADD xsum3, xtemp1, a5, xsum3
905 STFD y03, 2 * SIZE(YY)
906 FMADD y07, atemp1, a3, y07
909 FMADD xsum4, xtemp2, a5, xsum4
910 STFD y04, 3 * SIZE(YY)
911 FMADD y08, atemp2, a3, y08
914 FMADD1 xsum1, xtemp2, a2, xsum1
915 LFD y01, 8 * SIZE(YY)
916 FNMSUB y05, atemp2, a2, y05
919 FMADD2 xsum2, xtemp1, a2, xsum2
920 LFD y02, 9 * SIZE(YY)
921 FMADD y06, atemp1, a2, y06
922 LFD a2, 9 * SIZE(AO1)
924 FMADD1 xsum3, xtemp2, a6, xsum3
925 LFD xtemp2, 9 * SIZE(XX)
926 FNMSUB y07, atemp2, a4, y07
929 FMADD2 xsum4, xtemp1, a6, xsum4
930 LFD xtemp1, 8 * SIZE(XX)
931 FMADD y08, atemp1, a4, y08
934 FMADD xsum1, xtemp3, a3, xsum1
935 LFD y03, 10 * SIZE(YY)
936 FMADD y05, atemp3, a5, y05
939 FMADD xsum2, xtemp4, a3, xsum2
940 LFD a3, 10 * SIZE(AO1)
941 FMADD y06, atemp4, a5, y06
942 LFD a5, 8 * SIZE(AO2)
944 FMADD xsum3, xtemp3, a7, xsum3
945 LFD y04, 11 * SIZE(YY)
946 FMADD y07, atemp3, a7, y07
949 FMADD xsum4, xtemp4, a7, xsum4
951 FMADD y08, atemp4, a7, y08
952 LFD a7, 10 * SIZE(AO2)
954 FMADD1 xsum1, xtemp4, a4, xsum1
956 FNMSUB y05, atemp4, a6, y05
959 FMADD2 xsum2, xtemp3, a4, xsum2
960 LFD a4, 11 * SIZE(AO1)
961 FMADD y06, atemp3, a6, y06
962 LFD a6, 9 * SIZE(AO2)
964 FMADD1 xsum3, xtemp4, a8, xsum3
965 LFD xtemp4, 11 * SIZE(XX)
966 FNMSUB y07, atemp4, a8, y07
971 FMADD2 xsum4, xtemp3, a8, xsum4
972 LFD xtemp3, 10 * SIZE(XX)
973 FMADD y08, atemp3, a8, y08
974 LFD a8, 11 * SIZE(AO2)
/* -- YY offsets 8..11 (y01..y04); y05..y08 are stored to offsets 4..7 -- */
976 FMADD xsum1, xtemp1, a1, xsum1
977 STFD y05, 4 * SIZE(YY)
978 FMADD y01, atemp1, a1, y01
981 FMADD xsum2, xtemp2, a1, xsum2
982 STFD y06, 5 * SIZE(YY)
983 FMADD y02, atemp2, a1, y02
984 LFD a1, 12 * SIZE(AO1)
986 FMADD xsum3, xtemp1, a5, xsum3
987 STFD y07, 6 * SIZE(YY)
988 FMADD y03, atemp1, a3, y03
991 FMADD xsum4, xtemp2, a5, xsum4
992 STFD y08, 7 * SIZE(YY)
993 FMADD y04, atemp2, a3, y04
996 FMADD1 xsum1, xtemp2, a2, xsum1
997 LFD y05, 12 * SIZE(YY)
998 FNMSUB y01, atemp2, a2, y01
1001 FMADD2 xsum2, xtemp1, a2, xsum2
1002 LFD y06, 13 * SIZE(YY)
1003 FMADD y02, atemp1, a2, y02
1004 LFD a2, 13 * SIZE(AO1)
1006 FMADD1 xsum3, xtemp2, a6, xsum3
1007 LFD xtemp2, 13 * SIZE(XX)
1008 FNMSUB y03, atemp2, a4, y03
1011 FMADD2 xsum4, xtemp1, a6, xsum4
1012 LFD xtemp1, 12 * SIZE(XX)
1013 FMADD y04, atemp1, a4, y04
1016 FMADD xsum1, xtemp3, a3, xsum1
1017 LFD y07, 14 * SIZE(YY)
1018 FMADD y01, atemp3, a5, y01
1021 FMADD xsum2, xtemp4, a3, xsum2
1022 LFD a3, 14 * SIZE(AO1)
1023 FMADD y02, atemp4, a5, y02
1024 LFD a5, 12 * SIZE(AO2)
1026 FMADD xsum3, xtemp3, a7, xsum3
1027 LFD y08, 15 * SIZE(YY)
1028 FMADD y03, atemp3, a7, y03
1031 FMADD xsum4, xtemp4, a7, xsum4
1033 FMADD y04, atemp4, a7, y04
1034 LFD a7, 14 * SIZE(AO2)
1036 FMADD1 xsum1, xtemp4, a4, xsum1
1038 FNMSUB y01, atemp4, a6, y01
1041 FMADD2 xsum2, xtemp3, a4, xsum2
1042 LFD a4, 15 * SIZE(AO1)
1043 FMADD y02, atemp3, a6, y02
1044 LFD a6, 13 * SIZE(AO2)
1046 FMADD1 xsum3, xtemp4, a8, xsum3
1047 LFD xtemp4, 15 * SIZE(XX)
1048 FNMSUB y03, atemp4, a8, y03
1051 FMADD2 xsum4, xtemp3, a8, xsum4
1052 LFD xtemp3, 14 * SIZE(XX)
1053 FMADD y04, atemp3, a8, y04
1054 LFD a8, 15 * SIZE(AO2)
/* -- YY offsets 12..15 (y05..y08); y01..y04 are stored to offsets 8..11. */
/* AO2, XX, AO1 and YY are each advanced by 16 * SIZE inside this step. -- */
1056 FMADD xsum1, xtemp1, a1, xsum1
1057 STFD y01, 8 * SIZE(YY)
1058 FMADD y05, atemp1, a1, y05
1061 FMADD xsum2, xtemp2, a1, xsum2
1062 STFD y02, 9 * SIZE(YY)
1063 FMADD y06, atemp2, a1, y06
1064 LFD a1, 16 * SIZE(AO1)
1066 FMADD xsum3, xtemp1, a5, xsum3
1067 STFD y03, 10 * SIZE(YY)
1068 FMADD y07, atemp1, a3, y07
1071 FMADD xsum4, xtemp2, a5, xsum4
1072 STFD y04, 11 * SIZE(YY)
1073 FMADD y08, atemp2, a3, y08
1076 FMADD1 xsum1, xtemp2, a2, xsum1
1077 LFD y01, 16 * SIZE(YY)
1078 FNMSUB y05, atemp2, a2, y05
1081 FMADD2 xsum2, xtemp1, a2, xsum2
1082 LFD y02, 17 * SIZE(YY)
1083 FMADD y06, atemp1, a2, y06
1084 LFD a2, 17 * SIZE(AO1)
1086 FMADD1 xsum3, xtemp2, a6, xsum3
1087 LFD xtemp2, 17 * SIZE(XX)
1088 FNMSUB y07, atemp2, a4, y07
1091 FMADD2 xsum4, xtemp1, a6, xsum4
1092 LFD xtemp1, 16 * SIZE(XX)
1093 FMADD y08, atemp1, a4, y08
1094 addi AO2, AO2, 16 * SIZE
1096 FMADD xsum1, xtemp3, a3, xsum1
1097 LFD y03, 18 * SIZE(YY)
1098 FMADD y05, atemp3, a5, y05
1099 addi XX, XX, 16 * SIZE
1101 FMADD xsum2, xtemp4, a3, xsum2
1102 LFD a3, 18 * SIZE(AO1)
1103 FMADD y06, atemp4, a5, y06
1104 LFD a5, 0 * SIZE(AO2)
1106 FMADD xsum3, xtemp3, a7, xsum3
1107 LFD y04, 19 * SIZE(YY)
1108 FMADD y07, atemp3, a7, y07
1111 FMADD xsum4, xtemp4, a7, xsum4
1112 addi AO1, AO1, 16 * SIZE
1113 FMADD y08, atemp4, a7, y08
1114 LFD a7, 2 * SIZE(AO2)
1116 FMADD1 xsum1, xtemp4, a4, xsum1
1117 addi YY, YY, 16 * SIZE
1118 FNMSUB y05, atemp4, a6, y05
1121 FMADD2 xsum2, xtemp3, a4, xsum2
1122 LFD a4, 3 * SIZE(AO1)
1123 FMADD y06, atemp3, a6, y06
1124 LFD a6, 1 * SIZE(AO2)
1126 FMADD1 xsum3, xtemp4, a8, xsum3
1127 LFD xtemp4, 3 * SIZE(XX)
1128 FNMSUB y07, atemp4, a8, y07
1131 FMADD2 xsum4, xtemp3, a8, xsum4
1132 LFD xtemp3, 2 * SIZE(XX)
1133 FMADD y08, atemp3, a8, y08
1134 LFD a8, 3 * SIZE(AO2)
/* Drain: store the last finished y05..y08 just below the advanced YY. */
1136 STFD y05, -4 * SIZE(YY)
1137 STFD y06, -3 * SIZE(YY)
1138 STFD y07, -2 * SIZE(YY)
1139 STFD y08, -1 * SIZE(YY)
/* Group covering 8 * SIZE of AO1, AO2, XX and YY, using the same */
/* xsum / y accumulation pattern as the 16 * SIZE groups. */
/* NOTE(review): the test and branch that select this group are not */
/* visible in this view. */
/* -- YY offsets 0..3 (y01..y04) -- */
1146 FMADD xsum1, xtemp1, a1, xsum1
1148 FMADD y01, atemp1, a1, y01
1151 FMADD xsum2, xtemp2, a1, xsum2
1153 FMADD y02, atemp2, a1, y02
1154 LFD a1, 4 * SIZE(AO1)
1156 FMADD xsum3, xtemp1, a5, xsum3
1158 FMADD y03, atemp1, a3, y03
1161 FMADD xsum4, xtemp2, a5, xsum4
1163 FMADD y04, atemp2, a3, y04
1166 FMADD1 xsum1, xtemp2, a2, xsum1
1167 LFD y05, 4 * SIZE(YY)
1168 FNMSUB y01, atemp2, a2, y01
1171 FMADD2 xsum2, xtemp1, a2, xsum2
1172 LFD y06, 5 * SIZE(YY)
1173 FMADD y02, atemp1, a2, y02
1174 LFD a2, 5 * SIZE(AO1)
1176 FMADD1 xsum3, xtemp2, a6, xsum3
1177 LFD xtemp2, 5 * SIZE(XX)
1178 FNMSUB y03, atemp2, a4, y03
1181 FMADD2 xsum4, xtemp1, a6, xsum4
1182 LFD xtemp1, 4 * SIZE(XX)
1183 FMADD y04, atemp1, a4, y04
1186 FMADD xsum1, xtemp3, a3, xsum1
1187 LFD y07, 6 * SIZE(YY)
1188 FMADD y01, atemp3, a5, y01
1191 FMADD xsum2, xtemp4, a3, xsum2
1192 LFD a3, 6 * SIZE(AO1)
1193 FMADD y02, atemp4, a5, y02
1194 LFD a5, 4 * SIZE(AO2)
1196 FMADD xsum3, xtemp3, a7, xsum3
1197 LFD y08, 7 * SIZE(YY)
1198 FMADD y03, atemp3, a7, y03
1201 FMADD xsum4, xtemp4, a7, xsum4
1203 FMADD y04, atemp4, a7, y04
1204 LFD a7, 6 * SIZE(AO2)
1206 FMADD1 xsum1, xtemp4, a4, xsum1
1208 FNMSUB y01, atemp4, a6, y01
1211 FMADD2 xsum2, xtemp3, a4, xsum2
1212 LFD a4, 7 * SIZE(AO1)
1213 FMADD y02, atemp3, a6, y02
1214 LFD a6, 5 * SIZE(AO2)
1216 FMADD1 xsum3, xtemp4, a8, xsum3
1217 LFD xtemp4, 7 * SIZE(XX)
1218 FNMSUB y03, atemp4, a8, y03
1221 FMADD2 xsum4, xtemp3, a8, xsum4
1222 LFD xtemp3, 6 * SIZE(XX)
1223 FMADD y04, atemp3, a8, y04
1224 LFD a8, 7 * SIZE(AO2)
/* -- YY offsets 4..7 (y05..y08); y01..y04 are stored to offsets 0..3. */
/* YY, AO2, XX and AO1 are each advanced by 8 * SIZE inside this step, */
/* so loads placed after a bump use the small offsets of what follows. -- */
1226 FMADD xsum1, xtemp1, a1, xsum1
1227 STFD y01, 0 * SIZE(YY)
1228 FMADD y05, atemp1, a1, y05
1231 FMADD xsum2, xtemp2, a1, xsum2
1232 STFD y02, 1 * SIZE(YY)
1233 FMADD y06, atemp2, a1, y06
1234 LFD a1, 8 * SIZE(AO1)
1236 FMADD xsum3, xtemp1, a5, xsum3
1237 STFD y03, 2 * SIZE(YY)
1238 FMADD y07, atemp1, a3, y07
1241 FMADD xsum4, xtemp2, a5, xsum4
1242 STFD y04, 3 * SIZE(YY)
1243 FMADD y08, atemp2, a3, y08
1246 FMADD1 xsum1, xtemp2, a2, xsum1
1247 LFD y01, 8 * SIZE(YY)
1248 FNMSUB y05, atemp2, a2, y05
1251 FMADD2 xsum2, xtemp1, a2, xsum2
1252 LFD y02, 9 * SIZE(YY)
1253 FMADD y06, atemp1, a2, y06
1254 LFD a2, 9 * SIZE(AO1)
1256 FMADD1 xsum3, xtemp2, a6, xsum3
1257 LFD xtemp2, 9 * SIZE(XX)
1258 FNMSUB y07, atemp2, a4, y07
1261 FMADD2 xsum4, xtemp1, a6, xsum4
1262 LFD xtemp1, 8 * SIZE(XX)
1263 FMADD y08, atemp1, a4, y08
1266 FMADD xsum1, xtemp3, a3, xsum1
1267 LFD y03, 10 * SIZE(YY)
1268 FMADD y05, atemp3, a5, y05
1271 FMADD xsum2, xtemp4, a3, xsum2
1272 LFD a3, 10 * SIZE(AO1)
1273 FMADD y06, atemp4, a5, y06
1274 LFD a5, 8 * SIZE(AO2)
1276 FMADD xsum3, xtemp3, a7, xsum3
1277 LFD y04, 11 * SIZE(YY)
1278 FMADD y07, atemp3, a7, y07
1281 FMADD xsum4, xtemp4, a7, xsum4
1282 addi YY, YY, 8 * SIZE
1283 FMADD y08, atemp4, a7, y08
1284 LFD a7, 10 * SIZE(AO2)
1286 FMADD1 xsum1, xtemp4, a4, xsum1
1287 addi AO2, AO2, 8 * SIZE
1288 FNMSUB y05, atemp4, a6, y05
1289 addi XX, XX, 8 * SIZE
1291 FMADD2 xsum2, xtemp3, a4, xsum2
1292 LFD a4, 11 * SIZE(AO1)
1293 FMADD y06, atemp3, a6, y06
1294 LFD a6, 1 * SIZE(AO2)
1296 FMADD1 xsum3, xtemp4, a8, xsum3
1297 LFD xtemp4, 3 * SIZE(XX)
1298 FNMSUB y07, atemp4, a8, y07
1299 addi AO1, AO1, 8 * SIZE
1301 FMADD2 xsum4, xtemp3, a8, xsum4
1302 LFD xtemp3, 2 * SIZE(XX)
1303 FMADD y08, atemp3, a8, y08
1304 LFD a8, 3 * SIZE(AO2)
/* Store the finished y05..y08 just below the advanced YY. */
1306 STFD y05, -4 * SIZE(YY)
1307 STFD y06, -3 * SIZE(YY)
1308 STFD y07, -2 * SIZE(YY)
1309 STFD y08, -1 * SIZE(YY)
/* Group covering 4 * SIZE: updates xsum1..xsum4 and y01..y04, advancing */
/* AO1, AO2 and YY by 4 * SIZE. */
/* NOTE(review): XX is read at offsets 4 and 5 here but no XX advance is */
/* visible in this group; the selecting test/branch is also outside this */
/* view -- confirm against the full file. */
1316 FMADD xsum1, xtemp1, a1, xsum1
1318 FMADD y01, atemp1, a1, y01
1321 FMADD xsum2, xtemp2, a1, xsum2
1323 FMADD y02, atemp2, a1, y02
1324 LFD a1, 4 * SIZE(AO1)
1326 FMADD xsum3, xtemp1, a5, xsum3
1327 FMADD y03, atemp1, a3, y03
1328 FMADD xsum4, xtemp2, a5, xsum4
1329 FMADD y04, atemp2, a3, y04
1331 FMADD1 xsum1, xtemp2, a2, xsum1
1333 FNMSUB y01, atemp2, a2, y01
1336 FMADD2 xsum2, xtemp1, a2, xsum2
1338 FMADD y02, atemp1, a2, y02
1339 LFD a2, 5 * SIZE(AO1)
1341 FMADD1 xsum3, xtemp2, a6, xsum3
1342 LFD xtemp2, 5 * SIZE(XX)
1343 FNMSUB y03, atemp2, a4, y03
1346 FMADD2 xsum4, xtemp1, a6, xsum4
1347 LFD xtemp1, 4 * SIZE(XX)
1348 FMADD y04, atemp1, a4, y04
1351 FMADD xsum1, xtemp3, a3, xsum1
1353 FMADD y01, atemp3, a5, y01
1356 FMADD xsum2, xtemp4, a3, xsum2
1358 FMADD y02, atemp4, a5, y02
1359 LFD a5, 4 * SIZE(AO2)
1361 FMADD xsum3, xtemp3, a7, xsum3
1362 FMADD y03, atemp3, a7, y03
1363 FMADD xsum4, xtemp4, a7, xsum4
1364 FMADD y04, atemp4, a7, y04
1366 FMADD1 xsum1, xtemp4, a4, xsum1
1368 FNMSUB y01, atemp4, a6, y01
1371 FMADD2 xsum2, xtemp3, a4, xsum2
1373 FMADD y02, atemp3, a6, y02
1374 LFD a6, 5 * SIZE(AO2)
1376 FMADD1 xsum3, xtemp4, a8, xsum3
1377 addi AO1, AO1, 4 * SIZE
1378 FNMSUB y03, atemp4, a8, y03
1379 addi AO2, AO2, 4 * SIZE
1380 FMADD2 xsum4, xtemp3, a8, xsum4
1381 addi YY, YY, 4 * SIZE
1382 FMADD y04, atemp3, a8, y04
/* Store y01..y04 just below the advanced YY; y01 and y02 are then */
/* reloaded from the new offsets 0 and 1.  y03 / y04 are not reloaded */
/* in the visible lines. */
1385 STFD y01, -4 * SIZE(YY)
1386 LFD y01, 0 * SIZE(YY)
1387 STFD y02, -3 * SIZE(YY)
1388 LFD y02, 1 * SIZE(YY)
1390 STFD y03, -2 * SIZE(YY)
1391 STFD y04, -1 * SIZE(YY)
/* Final group covering 2 * SIZE: updates xsum1..xsum4 and only y01, y02. */
/* No pointer is advanced and no new operand is loaded in this group. */
1398 FMADD xsum1, xtemp1, a1, xsum1
1399 FMADD y01, atemp1, a1, y01
1400 FMADD xsum2, xtemp2, a1, xsum2
1401 FMADD y02, atemp2, a1, y02
1402 FMADD xsum3, xtemp1, a5, xsum3
1403 FNMSUB y01, atemp2, a2, y01
1404 FMADD xsum4, xtemp2, a5, xsum4
1405 FMADD y02, atemp1, a2, y02
1407 FMADD1 xsum1, xtemp2, a2, xsum1
1408 FMADD y01, atemp3, a5, y01
1409 FMADD2 xsum2, xtemp1, a2, xsum2
1410 FMADD y02, atemp4, a5, y02
1411 FMADD1 xsum3, xtemp2, a6, xsum3
1412 FNMSUB y01, atemp4, a6, y01
1413 FMADD2 xsum4, xtemp1, a6, xsum4
1414 FMADD y02, atemp3, a6, y02
/* Store y01..y04 to offsets 0..3 of YY. */
/* NOTE(review): y03 / y04 are written here although the visible lines of */
/* this group never update them; whether they always hold the values */
/* previously read from these offsets depends on control flow not visible */
/* in this view -- verify against the full file. */
1416 STFD y01, 0 * SIZE(YY)
1417 STFD y02, 1 * SIZE(YY)
1418 STFD y03, 2 * SIZE(YY)
1419 STFD y04, 3 * SIZE(YY)
/* Fold the accumulated xsum1..xsum4 into four values of YY. */
/* Comments assume FMADD d,a,c,b = a*c + b and */
/* FNMSUB d,a,c,b = -(a*c - b) -- confirm against the macro definitions. */
/* TEMP = IS shifted left by ZBASE_SHIFT; the instruction that uses TEMP */
/* (presumably to position YY) is not visible in this view. */
1426 slwi TEMP, IS, ZBASE_SHIFT
1429 LFD y01, 0 * SIZE(YY)
1430 LFD y02, 1 * SIZE(YY)
1431 LFD y03, 2 * SIZE(YY)
1432 LFD y04, 3 * SIZE(YY)
/* (xsum1, xsum2) <- (y05*xsum1 - y06*xsum2, y05*xsum2 + y06*xsum1), */
/* and the same for (xsum3, xsum4). */
/* NOTE(review): y05 / y06 presumably hold alpha_r / alpha_i reloaded */
/* outside this view -- confirm. */
1434 FMUL xtemp1, y05, xsum1
1435 FMUL xtemp2, y06, xsum1
1436 FMUL xtemp3, y05, xsum3
1437 FMUL xtemp4, y06, xsum3
1439 FNMSUB xsum1, y06, xsum2, xtemp1
1440 FMADD xsum2, y05, xsum2, xtemp2
1441 FNMSUB xsum3, y06, xsum4, xtemp3
1442 FMADD xsum4, y05, xsum4, xtemp4
/* Add the scaled sums to the loaded y values and write them back. */
1444 FADD y01, y01, xsum1
1445 FADD y02, y02, xsum2
1446 FADD y03, y03, xsum3
1447 FADD y04, y04, xsum4
1449 STFD y01, 0 * SIZE(YY)
1451 STFD y02, 1 * SIZE(YY)
1453 STFD y03, 2 * SIZE(YY)
1455 STFD y04, 3 * SIZE(YY)
/* Single-pair variant of the set-up and final update, working on one */
/* (atemp1, atemp2) pair and one (y01, y02) pair. */
/* Comments assume FMADD d,a,c,b = a*c + b and */
/* FNMSUB d,a,c,b = -(a*c - b) -- confirm against the macro definitions. */
/* NOTE(review): the loads of a1 / a2 and any branches or inner loop */
/* between these lines are not visible in this view. */
1463 slwi TEMP, IS, ZBASE_SHIFT
1470 LFD atemp1, 0 * SIZE(XX)
1471 LFD atemp2, 1 * SIZE(XX)
/* xsum1 = atemp1*a1 - atemp2*a2 ; xsum2 = atemp2*a1 + atemp1*a2 */
1476 FMUL xsum1, atemp1, a1
1477 FMUL xsum2, atemp2, a1
1480 FNMSUB xsum1, atemp2, a2, xsum1
1481 FMADD xsum2, atemp1, a2, xsum2
/* (atemp1, atemp2) <- (y05*atemp1 - y06*atemp2, y05*atemp2 + y06*atemp1) */
/* presumably y05 / y06 hold alpha here -- confirm. */
1484 FMUL xtemp1, y05, atemp1
1485 FMUL xtemp2, y06, atemp1
1487 FNMSUB atemp1, y06, atemp2, xtemp1
1488 FMADD atemp2, y05, atemp2, xtemp2
/* Load y01 / y02, scale (xsum1, xsum2) by (y05, y06) in the same way, */
/* add, and store back. */
1493 LFD y01, 0 * SIZE(YY)
1494 LFD y02, 1 * SIZE(YY)
1496 FMUL xtemp1, y05, xsum1
1497 FMUL xtemp2, y06, xsum1
1499 FNMSUB xsum1, y06, xsum2, xtemp1
1500 FMADD xsum2, y05, xsum2, xtemp2
1502 FADD y01, y01, xsum1
1503 FADD y02, y02, xsum2
1505 STFD y01, 0 * SIZE(YY)
1506 STFD y02, 1 * SIZE(YY)
/* Copy results from NEW_Y out through YY, then release the stack frame. */
/* Compare the shifted INCY with 2 * SIZE; the branch that consumes cr0 */
/* is not visible in this view. */
1510 cmpwi cr0, INCY, 2 * SIZE
/* Eight-value step: load eight consecutive values from NEW_Y and advance */
/* NEW_Y by 8 * SIZE. */
1534 LFD f8, 0 * SIZE(NEW_Y)
1535 LFD f9, 1 * SIZE(NEW_Y)
1536 LFD f10, 2 * SIZE(NEW_Y)
1537 LFD f11, 3 * SIZE(NEW_Y)
1538 LFD f12, 4 * SIZE(NEW_Y)
1539 LFD f13, 5 * SIZE(NEW_Y)
1540 LFD f14, 6 * SIZE(NEW_Y)
1541 LFD f15, 7 * SIZE(NEW_Y)
1542 addi NEW_Y, NEW_Y, 8 * SIZE
/* Each pair is stored to offsets 0 and 1 of YY. */
/* NOTE(review): YY is presumably advanced (by INCY) between pairs by */
/* instructions not visible in this view -- confirm. */
1553 STFD f8, 0 * SIZE(YY)
1554 STFD f9, 1 * SIZE(YY)
1556 STFD f10, 0 * SIZE(YY)
1557 STFD f11, 1 * SIZE(YY)
1559 STFD f12, 0 * SIZE(YY)
1560 STFD f13, 1 * SIZE(YY)
1562 STFD f14, 0 * SIZE(YY)
1563 STFD f15, 1 * SIZE(YY)
/* Four-value step: same pattern with two pairs. */
1579 LFD f8, 0 * SIZE(NEW_Y)
1580 LFD f9, 1 * SIZE(NEW_Y)
1581 LFD f10, 2 * SIZE(NEW_Y)
1582 LFD f11, 3 * SIZE(NEW_Y)
1583 addi NEW_Y, NEW_Y, 4 * SIZE
1590 STFD f8, 0 * SIZE(YY)
1591 STFD f9, 1 * SIZE(YY)
1593 STFD f10, 0 * SIZE(YY)
1594 STFD f11, 1 * SIZE(YY)
/* Two-value step: one pair. */
1605 LFD f8, 0 * SIZE(NEW_Y)
1606 LFD f9, 1 * SIZE(NEW_Y)
1611 STFD f8, 0 * SIZE(YY)
1612 STFD f9, 1 * SIZE(YY)
/* Release the STACKSIZE-byte frame reserved at entry. */
1669 addi SP, SP, STACKSIZE