1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration for this kernel.                          */
/* NOTE(review): the #else/#endif lines and most of the guarding      */
/* conditionals are not visible in this view, so several of the       */
/* definitions below are alternatives of which only one is active in  */
/* a given build.                                                     */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A is consumed further down as                         */
/* "li PREA, PREFETCHSIZE_A * SIZE"; each value below is presumably   */
/* selected per target CPU (guards not visible - confirm).            */
142 #define PREFETCHSIZE_A 24
145 #if defined(PPC440) || defined(PPC440FP2)
146 #define PREFETCHSIZE_A 24
150 #define PREFETCHSIZE_A 32
154 #define PREFETCHSIZE_A 72
158 #define PREFETCHSIZE_A 16
162 #define PREFETCHSIZE_A 96
166 #define PREFETCHSIZE_A 112
170 #define PREFETCHSIZE_A 112
173 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
/* NOP1/NOP2 copy a register onto itself, so they leave LDA and INCX  */
/* unchanged; presumably used as scheduling padding (no use is        */
/* visible in this view).                                             */
177 #define NOP1 mr LDA, LDA
178 #define NOP2 mr INCX, INCX
/* Two alternative stack-frame layouts. In both, ALPHA_R, ALPHA_I and */
/* FZERO are three consecutive 8-byte slots at the top of the frame   */
/* (STACKSIZE-24, STACKSIZE-16, STACKSIZE-8 from SP).                 */
184 #define STACKSIZE 224
185 #define ALPHA_R 200(SP)
186 #define ALPHA_I 208(SP)
187 #define FZERO 216(SP)
189 #define STACKSIZE 280
190 #define ALPHA_R 256(SP)
191 #define ALPHA_I 264(SP)
192 #define FZERO 272(SP)
/* FMADD1/FMADD2 select the sign of the imaginary-part cross terms in */
/* the xsum accumulations below. Only the FNMSUB variants are visible */
/* here; alternative definitions presumably exist on hidden lines.    */
196 #define FMADD1 FNMSUB
200 #define FMADD2 FNMSUB
/* Prologue: allocate the stack frame, fetch stack-passed arguments,  */
/* spill alpha, and scale strides.                                    */
/* NOTE(review): register saves and the preprocessor conditionals     */
/* that choose between the argument-loading variants below are not    */
/* visible in this view.                                              */
206 addi SP, SP, -STACKSIZE
/* Stack arguments are addressed as FRAMESLOT(n) + STACKSIZE(SP): the */
/* STACKSIZE term steps back over the frame just allocated above.     */
/* lwz variants load 32-bit words, ld variants load 64-bit            */
/* doublewords; which arguments arrive on the stack differs per       */
/* variant.                                                           */
265 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
267 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
268 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
269 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
273 #if defined(_AIX) || defined(__APPLE__)
276 lwz X, FRAMESLOT(0) + STACKSIZE(SP)
277 lwz INCX, FRAMESLOT(1) + STACKSIZE(SP)
278 lwz Y, FRAMESLOT(2) + STACKSIZE(SP)
279 lwz INCY, FRAMESLOT(3) + STACKSIZE(SP)
280 lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP)
282 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
283 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
284 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
287 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
288 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
289 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
/* Keep both halves of alpha in the frame slots defined by ALPHA_R    */
/* and ALPHA_I so they can be reloaded later.                         */
293 STFD alpha_r, ALPHA_R
294 STFD alpha_i, ALPHA_I
/* Shift LDA, INCX and INCY left by ZBASE_SHIFT; presumably converts  */
/* complex-element counts into byte strides (ZBASE_SHIFT is defined   */
/* outside this file view - confirm).                                 */
296 slwi LDA, LDA, ZBASE_SHIFT
297 slwi INCX, INCX, ZBASE_SHIFT
298 slwi INCY, INCY, ZBASE_SHIFT
/* PREA holds the prefetch distance as PREFETCHSIZE_A * SIZE.         */
300 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the scaled INCX with 2 * SIZE (the size of one complex     */
/* element if SIZE is one real - TODO confirm). The branch that       */
/* consumes cr0 is not visible; presumably it skips the copy of x     */
/* when x is already contiguous.                                      */
309 cmpwi cr0, INCX, 2 * SIZE
/* Staging of vectors into BUFFER.                                    */
/* NOTE(review): the loads that fill a1..a8, the loop labels and the  */
/* branches are not visible in this view; the comments describe the   */
/* visible stores only.                                               */
/* Store eight values (a1..a8) contiguously and advance BUFFER by     */
/* 8 * SIZE - presumably four complex elements gathered from a        */
/* strided x.                                                         */
337 STFD a1, 0 * SIZE(BUFFER)
338 STFD a2, 1 * SIZE(BUFFER)
339 STFD a3, 2 * SIZE(BUFFER)
340 STFD a4, 3 * SIZE(BUFFER)
341 STFD a5, 4 * SIZE(BUFFER)
342 STFD a6, 5 * SIZE(BUFFER)
343 STFD a7, 6 * SIZE(BUFFER)
344 STFD a8, 7 * SIZE(BUFFER)
346 addi BUFFER, BUFFER, 8 * SIZE
/* Tail variant: store two values and advance BUFFER by 2 * SIZE.     */
361 STFD a1, 0 * SIZE(BUFFER)
362 STFD a2, 1 * SIZE(BUFFER)
364 addi BUFFER, BUFFER, 2 * SIZE
/* Compare the scaled INCY with 2 * SIZE; the consuming branch is not */
/* visible (presumably decides whether y needs a contiguous work      */
/* buffer).                                                           */
372 cmpwi cr0, INCY, 2 * SIZE
/* Fill eight consecutive slots of BUFFER with the value held in f0   */
/* and advance by 8 * SIZE. f0 presumably holds zero at this point    */
/* (its load is not visible - confirm).                               */
383 STFD f0, 0 * SIZE(BUFFER)
384 STFD f0, 1 * SIZE(BUFFER)
385 STFD f0, 2 * SIZE(BUFFER)
386 STFD f0, 3 * SIZE(BUFFER)
387 STFD f0, 4 * SIZE(BUFFER)
388 STFD f0, 5 * SIZE(BUFFER)
389 STFD f0, 6 * SIZE(BUFFER)
390 STFD f0, 7 * SIZE(BUFFER)
391 addi BUFFER, BUFFER, 8 * SIZE
/* Setup for processing two matrix columns (AO1, AO2) at once.        */
/* NOTE(review): the instructions that set AO1/AO2/XX/YY, clear the   */
/* xsum accumulators, and load y05/y06 are not visible in this view.  */
/* TEMP = IS << ZBASE_SHIFT; the values loaded through TEMP below     */
/* imply a base address is added on a line not shown.                 */
406 slwi TEMP, IS, ZBASE_SHIFT
/* Four consecutive values read through TEMP; presumably two complex  */
/* x entries laid out as (re, im, re, im).                            */
412 LFD xtemp1, 0 * SIZE(TEMP)
413 LFD xtemp2, 1 * SIZE(TEMP)
414 LFD xtemp3, 2 * SIZE(TEMP)
415 LFD xtemp4, 3 * SIZE(TEMP)
/* Complex products, assuming FMUL/FMADD/FNMSUB map to the PowerPC    */
/* instructions of the same name:                                     */
/*   atemp1 = y05*xtemp1 - y06*xtemp2   atemp2 = y06*xtemp1 + y05*xtemp2 */
/*   atemp3 = y05*xtemp3 - y06*xtemp4   atemp4 = y06*xtemp3 + y05*xtemp4 */
/* y05/y06 presumably hold alpha (real, imaginary) here - confirm.    */
417 FMUL atemp1, y05, xtemp1
418 FMUL atemp2, y06, xtemp1
419 FMUL atemp3, y05, xtemp3
420 FMUL atemp4, y06, xtemp3
422 FNMSUB atemp1, y06, xtemp2, atemp1
423 FMADD atemp2, y05, xtemp2, atemp2
424 FNMSUB atemp3, y06, xtemp4, atemp3
425 FMADD atemp4, y05, xtemp4, atemp4
/* Preload the first four values of each column (a1..a4 from AO1,     */
/* a5..a8 from AO2), of the x stream (XX) and of the y stream (YY)    */
/* for the unrolled code that follows.                                */
435 LFD a1, 0 * SIZE(AO1)
436 LFD a2, 1 * SIZE(AO1)
437 LFD a3, 2 * SIZE(AO1)
438 LFD a4, 3 * SIZE(AO1)
440 LFD a5, 0 * SIZE(AO2)
441 LFD a6, 1 * SIZE(AO2)
442 LFD a7, 2 * SIZE(AO2)
443 LFD a8, 3 * SIZE(AO2)
445 LFD xtemp1, 0 * SIZE(XX)
446 LFD xtemp2, 1 * SIZE(XX)
447 LFD xtemp3, 2 * SIZE(XX)
448 LFD xtemp4, 3 * SIZE(XX)
450 LFD y01, 0 * SIZE(YY)
451 LFD y02, 1 * SIZE(YY)
452 LFD y03, 2 * SIZE(YY)
453 LFD y04, 3 * SIZE(YY)
/* Unrolled group covering 16 * SIZE of AO1, AO2, XX and YY (all four */
/* pointers are advanced by 16 * SIZE inside this group).             */
/* Pattern of every 4 * SIZE step (two complex rows):                 */
/*   xsum1/xsum2 accumulate x * (column AO1 values a1..a4)            */
/*   xsum3/xsum4 accumulate x * (column AO2 values a5..a8)            */
/*   y values are updated with atemp1/2 * a1..a4 and                  */
/*   atemp3/4 * a5..a8, using FMADD / FNMSUB with fixed signs.        */
/* The xsum cross terms use FMADD1/FMADD2 so their sign is selected   */
/* by the macros defined at the top of the file.                      */
/* Loads for the next step are interleaved with the arithmetic, so    */
/* statement order matters. y01..y04 and y05..y08 alternate as the    */
/* live y registers; the y05..y08 values updated at the end of this   */
/* group are not stored here - the code that follows stores them at   */
/* negative offsets from the advanced YY.                             */
/* NOTE(review): loop labels, branches and prefetch instructions are  */
/* not visible in this view.                                          */
/* --- step 1: y01..y04 (offsets 0..3) --- */
459 FMADD xsum1, xtemp1, a1, xsum1
461 FMADD y01, atemp1, a1, y01
464 FMADD xsum2, xtemp2, a1, xsum2
466 FMADD y02, atemp2, a1, y02
467 LFD a1, 4 * SIZE(AO1)
469 FMADD xsum3, xtemp1, a5, xsum3
471 FMADD y03, atemp1, a3, y03
474 FMADD xsum4, xtemp2, a5, xsum4
476 FMADD y04, atemp2, a3, y04
479 FMADD1 xsum1, xtemp2, a2, xsum1
480 LFD y05, 4 * SIZE(YY)
481 FNMSUB y01, atemp2, a2, y01
484 FMADD2 xsum2, xtemp1, a2, xsum2
485 LFD y06, 5 * SIZE(YY)
486 FMADD y02, atemp1, a2, y02
487 LFD a2, 5 * SIZE(AO1)
489 FMADD1 xsum3, xtemp2, a6, xsum3
490 LFD xtemp2, 5 * SIZE(XX)
491 FNMSUB y03, atemp2, a4, y03
494 FMADD2 xsum4, xtemp1, a6, xsum4
495 LFD xtemp1, 4 * SIZE(XX)
496 FMADD y04, atemp1, a4, y04
499 FMADD xsum1, xtemp3, a3, xsum1
500 LFD y07, 6 * SIZE(YY)
501 FMADD y01, atemp3, a5, y01
504 FMADD xsum2, xtemp4, a3, xsum2
505 LFD a3, 6 * SIZE(AO1)
506 FMADD y02, atemp4, a5, y02
507 LFD a5, 4 * SIZE(AO2)
509 FMADD xsum3, xtemp3, a7, xsum3
510 LFD y08, 7 * SIZE(YY)
511 FMADD y03, atemp3, a7, y03
514 FMADD xsum4, xtemp4, a7, xsum4
516 FMADD y04, atemp4, a7, y04
517 LFD a7, 6 * SIZE(AO2)
519 FMADD1 xsum1, xtemp4, a4, xsum1
521 FNMSUB y01, atemp4, a6, y01
525 FMADD2 xsum2, xtemp3, a4, xsum2
526 LFD a4, 7 * SIZE(AO1)
527 FMADD y02, atemp3, a6, y02
528 LFD a6, 5 * SIZE(AO2)
530 FMADD1 xsum3, xtemp4, a8, xsum3
531 LFD xtemp4, 7 * SIZE(XX)
532 FNMSUB y03, atemp4, a8, y03
535 FMADD2 xsum4, xtemp3, a8, xsum4
536 LFD xtemp3, 6 * SIZE(XX)
537 FMADD y04, atemp3, a8, y04
538 LFD a8, 7 * SIZE(AO2)
/* --- step 2: y05..y08 (offsets 4..7); y01..y04 are stored --- */
540 FMADD xsum1, xtemp1, a1, xsum1
541 STFD y01, 0 * SIZE(YY)
542 FMADD y05, atemp1, a1, y05
545 FMADD xsum2, xtemp2, a1, xsum2
546 STFD y02, 1 * SIZE(YY)
547 FMADD y06, atemp2, a1, y06
548 LFD a1, 8 * SIZE(AO1)
550 FMADD xsum3, xtemp1, a5, xsum3
551 STFD y03, 2 * SIZE(YY)
552 FMADD y07, atemp1, a3, y07
555 FMADD xsum4, xtemp2, a5, xsum4
556 STFD y04, 3 * SIZE(YY)
557 FMADD y08, atemp2, a3, y08
560 FMADD1 xsum1, xtemp2, a2, xsum1
561 LFD y01, 8 * SIZE(YY)
562 FNMSUB y05, atemp2, a2, y05
565 FMADD2 xsum2, xtemp1, a2, xsum2
566 LFD y02, 9 * SIZE(YY)
567 FMADD y06, atemp1, a2, y06
568 LFD a2, 9 * SIZE(AO1)
570 FMADD1 xsum3, xtemp2, a6, xsum3
571 LFD xtemp2, 9 * SIZE(XX)
572 FNMSUB y07, atemp2, a4, y07
575 FMADD2 xsum4, xtemp1, a6, xsum4
576 LFD xtemp1, 8 * SIZE(XX)
577 FMADD y08, atemp1, a4, y08
580 FMADD xsum1, xtemp3, a3, xsum1
581 LFD y03, 10 * SIZE(YY)
582 FMADD y05, atemp3, a5, y05
585 FMADD xsum2, xtemp4, a3, xsum2
586 LFD a3, 10 * SIZE(AO1)
587 FMADD y06, atemp4, a5, y06
588 LFD a5, 8 * SIZE(AO2)
590 FMADD xsum3, xtemp3, a7, xsum3
591 LFD y04, 11 * SIZE(YY)
592 FMADD y07, atemp3, a7, y07
595 FMADD xsum4, xtemp4, a7, xsum4
597 FMADD y08, atemp4, a7, y08
598 LFD a7, 10 * SIZE(AO2)
600 FMADD1 xsum1, xtemp4, a4, xsum1
602 FNMSUB y05, atemp4, a6, y05
605 FMADD2 xsum2, xtemp3, a4, xsum2
606 LFD a4, 11 * SIZE(AO1)
607 FMADD y06, atemp3, a6, y06
608 LFD a6, 9 * SIZE(AO2)
610 FMADD1 xsum3, xtemp4, a8, xsum3
611 LFD xtemp4, 11 * SIZE(XX)
612 FNMSUB y07, atemp4, a8, y07
617 FMADD2 xsum4, xtemp3, a8, xsum4
618 LFD xtemp3, 10 * SIZE(XX)
619 FMADD y08, atemp3, a8, y08
620 LFD a8, 11 * SIZE(AO2)
/* --- step 3: y01..y04 (offsets 8..11); y05..y08 are stored --- */
622 FMADD xsum1, xtemp1, a1, xsum1
623 STFD y05, 4 * SIZE(YY)
624 FMADD y01, atemp1, a1, y01
627 FMADD xsum2, xtemp2, a1, xsum2
628 STFD y06, 5 * SIZE(YY)
629 FMADD y02, atemp2, a1, y02
630 LFD a1, 12 * SIZE(AO1)
632 FMADD xsum3, xtemp1, a5, xsum3
633 STFD y07, 6 * SIZE(YY)
634 FMADD y03, atemp1, a3, y03
637 FMADD xsum4, xtemp2, a5, xsum4
638 STFD y08, 7 * SIZE(YY)
639 FMADD y04, atemp2, a3, y04
642 FMADD1 xsum1, xtemp2, a2, xsum1
643 LFD y05, 12 * SIZE(YY)
644 FNMSUB y01, atemp2, a2, y01
647 FMADD2 xsum2, xtemp1, a2, xsum2
648 LFD y06, 13 * SIZE(YY)
649 FMADD y02, atemp1, a2, y02
650 LFD a2, 13 * SIZE(AO1)
652 FMADD1 xsum3, xtemp2, a6, xsum3
653 LFD xtemp2, 13 * SIZE(XX)
654 FNMSUB y03, atemp2, a4, y03
657 FMADD2 xsum4, xtemp1, a6, xsum4
658 LFD xtemp1, 12 * SIZE(XX)
659 FMADD y04, atemp1, a4, y04
662 FMADD xsum1, xtemp3, a3, xsum1
663 LFD y07, 14 * SIZE(YY)
664 FMADD y01, atemp3, a5, y01
667 FMADD xsum2, xtemp4, a3, xsum2
668 LFD a3, 14 * SIZE(AO1)
669 FMADD y02, atemp4, a5, y02
670 LFD a5, 12 * SIZE(AO2)
672 FMADD xsum3, xtemp3, a7, xsum3
673 LFD y08, 15 * SIZE(YY)
674 FMADD y03, atemp3, a7, y03
677 FMADD xsum4, xtemp4, a7, xsum4
679 FMADD y04, atemp4, a7, y04
680 LFD a7, 14 * SIZE(AO2)
682 FMADD1 xsum1, xtemp4, a4, xsum1
684 FNMSUB y01, atemp4, a6, y01
688 FMADD2 xsum2, xtemp3, a4, xsum2
689 LFD a4, 15 * SIZE(AO1)
690 FMADD y02, atemp3, a6, y02
691 LFD a6, 13 * SIZE(AO2)
693 FMADD1 xsum3, xtemp4, a8, xsum3
694 LFD xtemp4, 15 * SIZE(XX)
695 FNMSUB y03, atemp4, a8, y03
698 FMADD2 xsum4, xtemp3, a8, xsum4
699 LFD xtemp3, 14 * SIZE(XX)
700 FMADD y04, atemp3, a8, y04
701 LFD a8, 15 * SIZE(AO2)
/* --- step 4: y05..y08 (offsets 12..15); y01..y04 are stored.      */
/* The pointer bumps (AO2, XX, AO1, YY, in that order) are spread    */
/* through this step, so each load below uses an offset relative to  */
/* whichever value its base register holds at that point.            */
703 FMADD xsum1, xtemp1, a1, xsum1
704 STFD y01, 8 * SIZE(YY)
705 FMADD y05, atemp1, a1, y05
708 FMADD xsum2, xtemp2, a1, xsum2
709 STFD y02, 9 * SIZE(YY)
710 FMADD y06, atemp2, a1, y06
711 LFD a1, 16 * SIZE(AO1)
713 FMADD xsum3, xtemp1, a5, xsum3
714 STFD y03, 10 * SIZE(YY)
715 FMADD y07, atemp1, a3, y07
718 FMADD xsum4, xtemp2, a5, xsum4
719 STFD y04, 11 * SIZE(YY)
720 FMADD y08, atemp2, a3, y08
723 FMADD1 xsum1, xtemp2, a2, xsum1
724 LFD y01, 16 * SIZE(YY)
725 FNMSUB y05, atemp2, a2, y05
728 FMADD2 xsum2, xtemp1, a2, xsum2
729 LFD y02, 17 * SIZE(YY)
730 FMADD y06, atemp1, a2, y06
731 LFD a2, 17 * SIZE(AO1)
733 FMADD1 xsum3, xtemp2, a6, xsum3
734 LFD xtemp2, 17 * SIZE(XX)
735 FNMSUB y07, atemp2, a4, y07
738 FMADD2 xsum4, xtemp1, a6, xsum4
739 LFD xtemp1, 16 * SIZE(XX)
740 FMADD y08, atemp1, a4, y08
741 addi AO2, AO2, 16 * SIZE
743 FMADD xsum1, xtemp3, a3, xsum1
744 LFD y03, 18 * SIZE(YY)
745 FMADD y05, atemp3, a5, y05
746 addi XX, XX, 16 * SIZE
748 FMADD xsum2, xtemp4, a3, xsum2
749 LFD a3, 18 * SIZE(AO1)
750 FMADD y06, atemp4, a5, y06
751 LFD a5, 0 * SIZE(AO2)
753 FMADD xsum3, xtemp3, a7, xsum3
754 LFD y04, 19 * SIZE(YY)
755 FMADD y07, atemp3, a7, y07
758 FMADD xsum4, xtemp4, a7, xsum4
759 addi AO1, AO1, 16 * SIZE
760 FMADD y08, atemp4, a7, y08
761 LFD a7, 2 * SIZE(AO2)
763 FMADD1 xsum1, xtemp4, a4, xsum1
764 addi YY, YY, 16 * SIZE
765 FNMSUB y05, atemp4, a6, y05
768 FMADD2 xsum2, xtemp3, a4, xsum2
769 LFD a4, 3 * SIZE(AO1)
770 FMADD y06, atemp3, a6, y06
771 LFD a6, 1 * SIZE(AO2)
773 FMADD1 xsum3, xtemp4, a8, xsum3
774 LFD xtemp4, 3 * SIZE(XX)
775 FNMSUB y07, atemp4, a8, y07
778 FMADD2 xsum4, xtemp3, a8, xsum4
779 LFD xtemp3, 2 * SIZE(XX)
780 FMADD y08, atemp3, a8, y08
781 LFD a8, 3 * SIZE(AO2)
/* Second unrolled group covering 16 * SIZE. The arithmetic pattern   */
/* is the same as in the preceding group (xsum1..xsum4 accumulate     */
/* x times the AO1/AO2 values; y is updated with atemp1..atemp4 times */
/* those values). The difference is that this group starts by storing */
/* y05..y08 at offsets -4..-1 from YY, i.e. results produced before   */
/* YY was last advanced.                                              */
/* The four trailing STFD instructions write out the y05..y08 values  */
/* this group leaves pending, again at offsets -4..-1 after YY was    */
/* bumped by 16 * SIZE.                                               */
/* NOTE(review): the loop label/branch that ties this group to the    */
/* previous one is not visible in this view; statement order is       */
/* load/use sensitive and must not be changed.                        */
/* --- step 1: y01..y04 (offsets 0..3); pending y05..y08 stored --- */
783 FMADD xsum1, xtemp1, a1, xsum1
784 STFD y05, -4 * SIZE(YY)
785 FMADD y01, atemp1, a1, y01
788 FMADD xsum2, xtemp2, a1, xsum2
789 STFD y06, -3 * SIZE(YY)
790 FMADD y02, atemp2, a1, y02
791 LFD a1, 4 * SIZE(AO1)
793 FMADD xsum3, xtemp1, a5, xsum3
794 STFD y07, -2 * SIZE(YY)
795 FMADD y03, atemp1, a3, y03
798 FMADD xsum4, xtemp2, a5, xsum4
799 STFD y08, -1 * SIZE(YY)
800 FMADD y04, atemp2, a3, y04
803 FMADD1 xsum1, xtemp2, a2, xsum1
804 LFD y05, 4 * SIZE(YY)
805 FNMSUB y01, atemp2, a2, y01
808 FMADD2 xsum2, xtemp1, a2, xsum2
809 LFD y06, 5 * SIZE(YY)
810 FMADD y02, atemp1, a2, y02
811 LFD a2, 5 * SIZE(AO1)
813 FMADD1 xsum3, xtemp2, a6, xsum3
814 LFD xtemp2, 5 * SIZE(XX)
815 FNMSUB y03, atemp2, a4, y03
818 FMADD2 xsum4, xtemp1, a6, xsum4
819 LFD xtemp1, 4 * SIZE(XX)
820 FMADD y04, atemp1, a4, y04
823 FMADD xsum1, xtemp3, a3, xsum1
824 LFD y07, 6 * SIZE(YY)
825 FMADD y01, atemp3, a5, y01
828 FMADD xsum2, xtemp4, a3, xsum2
829 LFD a3, 6 * SIZE(AO1)
830 FMADD y02, atemp4, a5, y02
831 LFD a5, 4 * SIZE(AO2)
833 FMADD xsum3, xtemp3, a7, xsum3
834 LFD y08, 7 * SIZE(YY)
835 FMADD y03, atemp3, a7, y03
838 FMADD xsum4, xtemp4, a7, xsum4
840 FMADD y04, atemp4, a7, y04
841 LFD a7, 6 * SIZE(AO2)
843 FMADD1 xsum1, xtemp4, a4, xsum1
845 FNMSUB y01, atemp4, a6, y01
849 FMADD2 xsum2, xtemp3, a4, xsum2
850 LFD a4, 7 * SIZE(AO1)
851 FMADD y02, atemp3, a6, y02
852 LFD a6, 5 * SIZE(AO2)
854 FMADD1 xsum3, xtemp4, a8, xsum3
855 LFD xtemp4, 7 * SIZE(XX)
856 FNMSUB y03, atemp4, a8, y03
859 FMADD2 xsum4, xtemp3, a8, xsum4
860 LFD xtemp3, 6 * SIZE(XX)
861 FMADD y04, atemp3, a8, y04
862 LFD a8, 7 * SIZE(AO2)
/* --- step 2: y05..y08 (offsets 4..7); y01..y04 are stored --- */
864 FMADD xsum1, xtemp1, a1, xsum1
865 STFD y01, 0 * SIZE(YY)
866 FMADD y05, atemp1, a1, y05
869 FMADD xsum2, xtemp2, a1, xsum2
870 STFD y02, 1 * SIZE(YY)
871 FMADD y06, atemp2, a1, y06
872 LFD a1, 8 * SIZE(AO1)
874 FMADD xsum3, xtemp1, a5, xsum3
875 STFD y03, 2 * SIZE(YY)
876 FMADD y07, atemp1, a3, y07
879 FMADD xsum4, xtemp2, a5, xsum4
880 STFD y04, 3 * SIZE(YY)
881 FMADD y08, atemp2, a3, y08
884 FMADD1 xsum1, xtemp2, a2, xsum1
885 LFD y01, 8 * SIZE(YY)
886 FNMSUB y05, atemp2, a2, y05
889 FMADD2 xsum2, xtemp1, a2, xsum2
890 LFD y02, 9 * SIZE(YY)
891 FMADD y06, atemp1, a2, y06
892 LFD a2, 9 * SIZE(AO1)
894 FMADD1 xsum3, xtemp2, a6, xsum3
895 LFD xtemp2, 9 * SIZE(XX)
896 FNMSUB y07, atemp2, a4, y07
899 FMADD2 xsum4, xtemp1, a6, xsum4
900 LFD xtemp1, 8 * SIZE(XX)
901 FMADD y08, atemp1, a4, y08
904 FMADD xsum1, xtemp3, a3, xsum1
905 LFD y03, 10 * SIZE(YY)
906 FMADD y05, atemp3, a5, y05
909 FMADD xsum2, xtemp4, a3, xsum2
910 LFD a3, 10 * SIZE(AO1)
911 FMADD y06, atemp4, a5, y06
912 LFD a5, 8 * SIZE(AO2)
914 FMADD xsum3, xtemp3, a7, xsum3
915 LFD y04, 11 * SIZE(YY)
916 FMADD y07, atemp3, a7, y07
919 FMADD xsum4, xtemp4, a7, xsum4
921 FMADD y08, atemp4, a7, y08
922 LFD a7, 10 * SIZE(AO2)
924 FMADD1 xsum1, xtemp4, a4, xsum1
926 FNMSUB y05, atemp4, a6, y05
929 FMADD2 xsum2, xtemp3, a4, xsum2
930 LFD a4, 11 * SIZE(AO1)
931 FMADD y06, atemp3, a6, y06
932 LFD a6, 9 * SIZE(AO2)
934 FMADD1 xsum3, xtemp4, a8, xsum3
935 LFD xtemp4, 11 * SIZE(XX)
936 FNMSUB y07, atemp4, a8, y07
941 FMADD2 xsum4, xtemp3, a8, xsum4
942 LFD xtemp3, 10 * SIZE(XX)
943 FMADD y08, atemp3, a8, y08
944 LFD a8, 11 * SIZE(AO2)
/* --- step 3: y01..y04 (offsets 8..11); y05..y08 are stored --- */
946 FMADD xsum1, xtemp1, a1, xsum1
947 STFD y05, 4 * SIZE(YY)
948 FMADD y01, atemp1, a1, y01
951 FMADD xsum2, xtemp2, a1, xsum2
952 STFD y06, 5 * SIZE(YY)
953 FMADD y02, atemp2, a1, y02
954 LFD a1, 12 * SIZE(AO1)
956 FMADD xsum3, xtemp1, a5, xsum3
957 STFD y07, 6 * SIZE(YY)
958 FMADD y03, atemp1, a3, y03
961 FMADD xsum4, xtemp2, a5, xsum4
962 STFD y08, 7 * SIZE(YY)
963 FMADD y04, atemp2, a3, y04
966 FMADD1 xsum1, xtemp2, a2, xsum1
967 LFD y05, 12 * SIZE(YY)
968 FNMSUB y01, atemp2, a2, y01
971 FMADD2 xsum2, xtemp1, a2, xsum2
972 LFD y06, 13 * SIZE(YY)
973 FMADD y02, atemp1, a2, y02
974 LFD a2, 13 * SIZE(AO1)
976 FMADD1 xsum3, xtemp2, a6, xsum3
977 LFD xtemp2, 13 * SIZE(XX)
978 FNMSUB y03, atemp2, a4, y03
981 FMADD2 xsum4, xtemp1, a6, xsum4
982 LFD xtemp1, 12 * SIZE(XX)
983 FMADD y04, atemp1, a4, y04
986 FMADD xsum1, xtemp3, a3, xsum1
987 LFD y07, 14 * SIZE(YY)
988 FMADD y01, atemp3, a5, y01
991 FMADD xsum2, xtemp4, a3, xsum2
992 LFD a3, 14 * SIZE(AO1)
993 FMADD y02, atemp4, a5, y02
994 LFD a5, 12 * SIZE(AO2)
996 FMADD xsum3, xtemp3, a7, xsum3
997 LFD y08, 15 * SIZE(YY)
998 FMADD y03, atemp3, a7, y03
1001 FMADD xsum4, xtemp4, a7, xsum4
1003 FMADD y04, atemp4, a7, y04
1004 LFD a7, 14 * SIZE(AO2)
1006 FMADD1 xsum1, xtemp4, a4, xsum1
1008 FNMSUB y01, atemp4, a6, y01
1011 FMADD2 xsum2, xtemp3, a4, xsum2
1012 LFD a4, 15 * SIZE(AO1)
1013 FMADD y02, atemp3, a6, y02
1014 LFD a6, 13 * SIZE(AO2)
1016 FMADD1 xsum3, xtemp4, a8, xsum3
1017 LFD xtemp4, 15 * SIZE(XX)
1018 FNMSUB y03, atemp4, a8, y03
1021 FMADD2 xsum4, xtemp3, a8, xsum4
1022 LFD xtemp3, 14 * SIZE(XX)
1023 FMADD y04, atemp3, a8, y04
1024 LFD a8, 15 * SIZE(AO2)
/* --- step 4: y05..y08 (offsets 12..15); y01..y04 are stored;      */
/* AO2, XX, AO1 and YY are each advanced by 16 * SIZE in between.    */
1026 FMADD xsum1, xtemp1, a1, xsum1
1027 STFD y01, 8 * SIZE(YY)
1028 FMADD y05, atemp1, a1, y05
1031 FMADD xsum2, xtemp2, a1, xsum2
1032 STFD y02, 9 * SIZE(YY)
1033 FMADD y06, atemp2, a1, y06
1034 LFD a1, 16 * SIZE(AO1)
1036 FMADD xsum3, xtemp1, a5, xsum3
1037 STFD y03, 10 * SIZE(YY)
1038 FMADD y07, atemp1, a3, y07
1041 FMADD xsum4, xtemp2, a5, xsum4
1042 STFD y04, 11 * SIZE(YY)
1043 FMADD y08, atemp2, a3, y08
1046 FMADD1 xsum1, xtemp2, a2, xsum1
1047 LFD y01, 16 * SIZE(YY)
1048 FNMSUB y05, atemp2, a2, y05
1051 FMADD2 xsum2, xtemp1, a2, xsum2
1052 LFD y02, 17 * SIZE(YY)
1053 FMADD y06, atemp1, a2, y06
1054 LFD a2, 17 * SIZE(AO1)
1056 FMADD1 xsum3, xtemp2, a6, xsum3
1057 LFD xtemp2, 17 * SIZE(XX)
1058 FNMSUB y07, atemp2, a4, y07
1061 FMADD2 xsum4, xtemp1, a6, xsum4
1062 LFD xtemp1, 16 * SIZE(XX)
1063 FMADD y08, atemp1, a4, y08
1064 addi AO2, AO2, 16 * SIZE
1066 FMADD xsum1, xtemp3, a3, xsum1
1067 LFD y03, 18 * SIZE(YY)
1068 FMADD y05, atemp3, a5, y05
1069 addi XX, XX, 16 * SIZE
1071 FMADD xsum2, xtemp4, a3, xsum2
1072 LFD a3, 18 * SIZE(AO1)
1073 FMADD y06, atemp4, a5, y06
1074 LFD a5, 0 * SIZE(AO2)
1076 FMADD xsum3, xtemp3, a7, xsum3
1077 LFD y04, 19 * SIZE(YY)
1078 FMADD y07, atemp3, a7, y07
1081 FMADD xsum4, xtemp4, a7, xsum4
1082 addi AO1, AO1, 16 * SIZE
1083 FMADD y08, atemp4, a7, y08
1084 LFD a7, 2 * SIZE(AO2)
1086 FMADD1 xsum1, xtemp4, a4, xsum1
1087 addi YY, YY, 16 * SIZE
1088 FNMSUB y05, atemp4, a6, y05
1091 FMADD2 xsum2, xtemp3, a4, xsum2
1092 LFD a4, 3 * SIZE(AO1)
1093 FMADD y06, atemp3, a6, y06
1094 LFD a6, 1 * SIZE(AO2)
1096 FMADD1 xsum3, xtemp4, a8, xsum3
1097 LFD xtemp4, 3 * SIZE(XX)
1098 FNMSUB y07, atemp4, a8, y07
1101 FMADD2 xsum4, xtemp3, a8, xsum4
1102 LFD xtemp3, 2 * SIZE(XX)
1103 FMADD y08, atemp3, a8, y08
1104 LFD a8, 3 * SIZE(AO2)
/* Write out the y05..y08 results still held in registers; YY has    */
/* already been advanced, hence the negative offsets.                */
1106 STFD y05, -4 * SIZE(YY)
1107 STFD y06, -3 * SIZE(YY)
1108 STFD y07, -2 * SIZE(YY)
1109 STFD y08, -1 * SIZE(YY)
/* Unrolled group covering 8 * SIZE of AO1, AO2, XX and YY (two       */
/* 4 * SIZE steps). Same arithmetic pattern as the 16 * SIZE groups:  */
/* xsum1..xsum4 accumulate x times the column values, y01..y08 are    */
/* updated with atemp1..atemp4 times the column values. Here the      */
/* pointers are advanced once, at the end, after the final stores.    */
/* NOTE(review): the test/branch that selects this path is not        */
/* visible in this view; presumably it handles a remainder of the     */
/* 16 * SIZE code - confirm.                                          */
/* --- step 1: y01..y04 (offsets 0..3) --- */
1116 FMADD xsum1, xtemp1, a1, xsum1
1118 FMADD y01, atemp1, a1, y01
1121 FMADD xsum2, xtemp2, a1, xsum2
1123 FMADD y02, atemp2, a1, y02
1124 LFD a1, 4 * SIZE(AO1)
1126 FMADD xsum3, xtemp1, a5, xsum3
1128 FMADD y03, atemp1, a3, y03
1131 FMADD xsum4, xtemp2, a5, xsum4
1133 FMADD y04, atemp2, a3, y04
1136 FMADD1 xsum1, xtemp2, a2, xsum1
1137 LFD y05, 4 * SIZE(YY)
1138 FNMSUB y01, atemp2, a2, y01
1141 FMADD2 xsum2, xtemp1, a2, xsum2
1142 LFD y06, 5 * SIZE(YY)
1143 FMADD y02, atemp1, a2, y02
1144 LFD a2, 5 * SIZE(AO1)
1146 FMADD1 xsum3, xtemp2, a6, xsum3
1147 LFD xtemp2, 5 * SIZE(XX)
1148 FNMSUB y03, atemp2, a4, y03
1151 FMADD2 xsum4, xtemp1, a6, xsum4
1152 LFD xtemp1, 4 * SIZE(XX)
1153 FMADD y04, atemp1, a4, y04
1156 FMADD xsum1, xtemp3, a3, xsum1
1157 LFD y07, 6 * SIZE(YY)
1158 FMADD y01, atemp3, a5, y01
1161 FMADD xsum2, xtemp4, a3, xsum2
1162 LFD a3, 6 * SIZE(AO1)
1163 FMADD y02, atemp4, a5, y02
1164 LFD a5, 4 * SIZE(AO2)
1166 FMADD xsum3, xtemp3, a7, xsum3
1167 LFD y08, 7 * SIZE(YY)
1168 FMADD y03, atemp3, a7, y03
1171 FMADD xsum4, xtemp4, a7, xsum4
1173 FMADD y04, atemp4, a7, y04
1174 LFD a7, 6 * SIZE(AO2)
1176 FMADD1 xsum1, xtemp4, a4, xsum1
1178 FNMSUB y01, atemp4, a6, y01
1181 FMADD2 xsum2, xtemp3, a4, xsum2
1182 LFD a4, 7 * SIZE(AO1)
1183 FMADD y02, atemp3, a6, y02
1184 LFD a6, 5 * SIZE(AO2)
1186 FMADD1 xsum3, xtemp4, a8, xsum3
1187 LFD xtemp4, 7 * SIZE(XX)
1188 FNMSUB y03, atemp4, a8, y03
1191 FMADD2 xsum4, xtemp3, a8, xsum4
1192 LFD xtemp3, 6 * SIZE(XX)
1193 FMADD y04, atemp3, a8, y04
1194 LFD a8, 7 * SIZE(AO2)
/* --- step 2: y05..y08 (offsets 4..7); y01..y04 are stored and     */
/* values at offsets 8..11 are preloaded for the code that follows. */
1196 FMADD xsum1, xtemp1, a1, xsum1
1197 STFD y01, 0 * SIZE(YY)
1198 FMADD y05, atemp1, a1, y05
1201 FMADD xsum2, xtemp2, a1, xsum2
1202 STFD y02, 1 * SIZE(YY)
1203 FMADD y06, atemp2, a1, y06
1204 LFD a1, 8 * SIZE(AO1)
1206 FMADD xsum3, xtemp1, a5, xsum3
1207 STFD y03, 2 * SIZE(YY)
1208 FMADD y07, atemp1, a3, y07
1211 FMADD xsum4, xtemp2, a5, xsum4
1212 STFD y04, 3 * SIZE(YY)
1213 FMADD y08, atemp2, a3, y08
1216 FMADD1 xsum1, xtemp2, a2, xsum1
1217 LFD y01, 8 * SIZE(YY)
1218 FNMSUB y05, atemp2, a2, y05
1221 FMADD2 xsum2, xtemp1, a2, xsum2
1222 LFD y02, 9 * SIZE(YY)
1223 FMADD y06, atemp1, a2, y06
1224 LFD a2, 9 * SIZE(AO1)
1226 FMADD1 xsum3, xtemp2, a6, xsum3
1227 LFD xtemp2, 9 * SIZE(XX)
1228 FNMSUB y07, atemp2, a4, y07
1231 FMADD2 xsum4, xtemp1, a6, xsum4
1232 LFD xtemp1, 8 * SIZE(XX)
1233 FMADD y08, atemp1, a4, y08
1236 FMADD xsum1, xtemp3, a3, xsum1
1237 LFD y03, 10 * SIZE(YY)
1238 FMADD y05, atemp3, a5, y05
1241 FMADD xsum2, xtemp4, a3, xsum2
1242 LFD a3, 10 * SIZE(AO1)
1243 FMADD y06, atemp4, a5, y06
1244 LFD a5, 8 * SIZE(AO2)
1246 FMADD xsum3, xtemp3, a7, xsum3
1247 LFD y04, 11 * SIZE(YY)
1248 FMADD y07, atemp3, a7, y07
1251 FMADD xsum4, xtemp4, a7, xsum4
1253 FMADD y08, atemp4, a7, y08
1254 LFD a7, 10 * SIZE(AO2)
1256 FMADD1 xsum1, xtemp4, a4, xsum1
1258 FNMSUB y05, atemp4, a6, y05
1261 FMADD2 xsum2, xtemp3, a4, xsum2
1262 LFD a4, 11 * SIZE(AO1)
1263 FMADD y06, atemp3, a6, y06
1264 LFD a6, 9 * SIZE(AO2)
1266 FMADD1 xsum3, xtemp4, a8, xsum3
1267 LFD xtemp4, 11 * SIZE(XX)
1268 FNMSUB y07, atemp4, a8, y07
1270 FMADD2 xsum4, xtemp3, a8, xsum4
1271 LFD xtemp3, 10 * SIZE(XX)
1272 FMADD y08, atemp3, a8, y08
1273 LFD a8, 11 * SIZE(AO2)
/* Store the second step's y results, then advance all four pointers */
/* past the 8 * SIZE just processed.                                 */
1275 STFD y05, 4 * SIZE(YY)
1276 STFD y06, 5 * SIZE(YY)
1277 STFD y07, 6 * SIZE(YY)
1278 STFD y08, 7 * SIZE(YY)
1280 addi AO1, AO1, 8 * SIZE
1281 addi AO2, AO2, 8 * SIZE
1283 addi XX, XX, 8 * SIZE
1284 addi YY, YY, 8 * SIZE
/* Single 4 * SIZE step without interleaved loads: accumulate         */
/* xsum1..xsum4 and update y01..y04 from the values already held in   */
/* a1..a8, xtemp1..xtemp4 and atemp1..atemp4, then store y01..y04.    */
/* NOTE(review): the test/branch that selects this path is not        */
/* visible in this view.                                              */
1291 FMADD xsum1, xtemp1, a1, xsum1
1292 FMADD y01, atemp1, a1, y01
1293 FMADD xsum2, xtemp2, a1, xsum2
1294 FMADD y02, atemp2, a1, y02
1295 FMADD xsum3, xtemp1, a5, xsum3
1296 FMADD y03, atemp1, a3, y03
1297 FMADD xsum4, xtemp2, a5, xsum4
1298 FMADD y04, atemp2, a3, y04
/* Cross terms with the odd-indexed column values (a2, a4, a6):       */
/* xsum uses the sign-selectable FMADD1/FMADD2, y uses fixed          */
/* FNMSUB/FMADD.                                                      */
1300 FMADD1 xsum1, xtemp2, a2, xsum1
1301 FNMSUB y01, atemp2, a2, y01
1302 FMADD2 xsum2, xtemp1, a2, xsum2
1303 FMADD y02, atemp1, a2, y02
1304 FMADD1 xsum3, xtemp2, a6, xsum3
1305 FNMSUB y03, atemp2, a4, y03
1306 FMADD2 xsum4, xtemp1, a6, xsum4
1307 FMADD y04, atemp1, a4, y04
1309 FMADD xsum1, xtemp3, a3, xsum1
1310 FMADD y01, atemp3, a5, y01
1311 FMADD xsum2, xtemp4, a3, xsum2
1312 FMADD y02, atemp4, a5, y02
1313 FMADD xsum3, xtemp3, a7, xsum3
1314 FMADD y03, atemp3, a7, y03
1315 FMADD xsum4, xtemp4, a7, xsum4
1316 FMADD y04, atemp4, a7, y04
1318 FMADD1 xsum1, xtemp4, a4, xsum1
1319 FNMSUB y01, atemp4, a6, y01
1320 FMADD2 xsum2, xtemp3, a4, xsum2
1321 FMADD y02, atemp3, a6, y02
1322 FMADD1 xsum3, xtemp4, a8, xsum3
1323 FNMSUB y03, atemp4, a8, y03
1324 FMADD2 xsum4, xtemp3, a8, xsum4
1325 FMADD y04, atemp3, a8, y04
1327 STFD y01, 0 * SIZE(YY)
1328 STFD y02, 1 * SIZE(YY)
1329 STFD y03, 2 * SIZE(YY)
1330 STFD y04, 3 * SIZE(YY)
/* Reload the registers the following code reads: a1/a2 from AO1,     */
/* a5..a8 from AO2 and y01..y04 from YY, all at offsets 4..7. Only YY */
/* is advanced here; AO1 and AO2 are left unchanged by the visible    */
/* lines.                                                             */
1332 LFD a1, 4 * SIZE(AO1)
1333 LFD a2, 5 * SIZE(AO1)
1335 LFD a5, 4 * SIZE(AO2)
1336 LFD a6, 5 * SIZE(AO2)
1337 LFD a7, 6 * SIZE(AO2)
1338 LFD a8, 7 * SIZE(AO2)
1340 LFD y01, 4 * SIZE(YY)
1341 LFD y02, 5 * SIZE(YY)
1342 LFD y03, 6 * SIZE(YY)
1343 LFD y04, 7 * SIZE(YY)
1345 addi YY, YY, 4 * SIZE
/* Finish the current column pair: scale the accumulated sums, add    */
/* the terms built from a1, a2, a5..a8 and atemp1..atemp4, and add    */
/* the result into y01..y04.                                          */
/* Presumably this is the 2x2 block on the diagonal, with y05/y06     */
/* holding alpha (their reload is not visible in this view -          */
/* confirm).                                                          */
/* (xsum1, xsum2) and (xsum3, xsum4) are each multiplied by           */
/* (y05, y06) as complex numbers; xtemp1..xtemp4 are used as scratch: */
/*   xsum1 = y05*xsum1 - y06*xsum2      xsum2 = y06*xsum1 + y05*xsum2 */
1352 FMUL xtemp1, y05, xsum1
1353 FMUL xtemp2, y06, xsum1
1354 FMUL xtemp3, y05, xsum3
1355 FMUL xtemp4, y06, xsum3
1357 FNMSUB xsum1, y06, xsum2, xtemp1
1358 FMADD xsum2, y05, xsum2, xtemp2
1359 FNMSUB xsum3, y06, xsum4, xtemp3
1360 FMADD xsum4, y05, xsum4, xtemp4
/* Contributions of (atemp1, atemp2) with (a1, a2) and (a5, a6).      */
1362 FMADD xsum1, atemp1, a1, xsum1
1363 FMADD xsum2, atemp2, a1, xsum2
1364 FMADD xsum3, atemp1, a5, xsum3
1365 FMADD xsum4, atemp2, a5, xsum4
1368 FMADD1 xsum1, atemp2, a2, xsum1
1369 FMADD2 xsum2, atemp1, a2, xsum2
1371 FMADD1 xsum3, atemp2, a6, xsum3
1372 FMADD2 xsum4, atemp1, a6, xsum4
/* Contributions of (atemp3, atemp4) with (a5, a6) and (a7, a8).      */
1374 FMADD xsum1, atemp3, a5, xsum1
1375 FMADD xsum2, atemp4, a5, xsum2
1376 FMADD xsum3, atemp3, a7, xsum3
1377 FMADD xsum4, atemp4, a7, xsum4
/* NOTE(review): these cross terms use fixed FNMSUB/FMADD while the   */
/* ones above use FMADD1/FMADD2; conditional alternatives may exist   */
/* on lines not visible here.                                         */
1379 FNMSUB xsum1, atemp4, a6, xsum1
1380 FMADD xsum2, atemp3, a6, xsum2
1382 FNMSUB xsum3, atemp4, a8, xsum3
1383 FMADD xsum4, atemp3, a8, xsum4
/* y += xsum for the four values at YY, then write them back.         */
1386 FADD y01, y01, xsum1
1387 FADD y02, y02, xsum2
1388 FADD y03, y03, xsum3
1389 FADD y04, y04, xsum4
1391 STFD y01, 0 * SIZE(YY)
1393 STFD y02, 1 * SIZE(YY)
1395 STFD y03, 2 * SIZE(YY)
1397 STFD y04, 3 * SIZE(YY)
/* One-column variant of the code above: a single column (AO1) and a  */
/* single complex multiplier (atemp1, atemp2) are processed, 2 * SIZE */
/* at a time.                                                         */
/* NOTE(review): the labels, loop branch, accumulator clearing and    */
/* the y05/y06 loads are not visible in this view; presumably this    */
/* path handles a leftover column when the column count is odd -      */
/* confirm.                                                           */
1407 slwi TEMP, IS, ZBASE_SHIFT
/* Two values read through TEMP, then multiplied by (y05, y06) as a   */
/* complex product:                                                   */
/*   atemp1 = y05*xtemp1 - y06*xtemp2   atemp2 = y06*xtemp1 + y05*xtemp2 */
1413 LFD xtemp1, 0 * SIZE(TEMP)
1414 LFD xtemp2, 1 * SIZE(TEMP)
1416 FMUL atemp1, y05, xtemp1
1417 FMUL atemp2, y06, xtemp1
1419 FNMSUB atemp1, y06, xtemp2, atemp1
1420 FMADD atemp2, y05, xtemp2, atemp2
/* Preload the first column value pair, x pair and y pair.            */
1428 LFD a1, 0 * SIZE(AO1)
1429 LFD a2, 1 * SIZE(AO1)
1431 LFD xtemp1, 0 * SIZE(XX)
1432 LFD xtemp2, 1 * SIZE(XX)
1434 LFD y01, 0 * SIZE(YY)
1435 LFD y02, 1 * SIZE(YY)
/* Per-pair step: xsum1/xsum2 accumulate x * a, y01/y02 are updated   */
/* with atemp * a; the next a and x values are loaded in between.     */
1443 FMADD xsum1, xtemp1, a1, xsum1
1444 FMADD y01, atemp1, a1, y01
1445 FMADD xsum2, xtemp2, a1, xsum2
1446 FMADD y02, atemp2, a1, y02
1447 LFD a1, 2 * SIZE(AO1)
1449 FMADD1 xsum1, xtemp2, a2, xsum1
1450 LFD xtemp2, 3 * SIZE(XX)
1451 FNMSUB y01, atemp2, a2, y01
1452 FMADD2 xsum2, xtemp1, a2, xsum2
1453 LFD xtemp1, 2 * SIZE(XX)
1454 FMADD y02, atemp1, a2, y02
1455 LFD a2, 3 * SIZE(AO1)
1457 addi AO1, AO1, 2 * SIZE
1458 addi XX, XX, 2 * SIZE
1459 addi YY, YY, 2 * SIZE
/* YY was just advanced: store the finished pair at -2/-1 and load    */
/* the next pair from 0/1.                                            */
1461 STFD y01, -2 * SIZE(YY)
1462 LFD y01, 0 * SIZE(YY)
1463 STFD y02, -1 * SIZE(YY)
1464 LFD y02, 1 * SIZE(YY)
/* Scale (xsum1, xsum2) by (y05, y06) as a complex product, add the   */
/* (atemp1, atemp2) * (a1, a2) term, then add the result into         */
/* (y01, y02) and store it.                                           */
1472 FMUL xtemp1, y05, xsum1
1473 FMUL xtemp2, y06, xsum1
1475 FNMSUB xsum1, y06, xsum2, xtemp1
1476 FMADD xsum2, y05, xsum2, xtemp2
1478 FMADD xsum1, atemp1, a1, xsum1
1479 FMADD xsum2, atemp2, a1, xsum2
/* NOTE(review): fixed FNMSUB/FMADD here, not FMADD1/FMADD2;          */
/* conditional alternatives may exist on lines not visible.           */
1482 FNMSUB xsum1, atemp2, a2, xsum1
1483 FMADD xsum2, atemp1, a2, xsum2
1486 FADD y01, y01, xsum1
1487 FADD y02, y02, xsum2
1489 STFD y01, 0 * SIZE(YY)
1490 STFD y02, 1 * SIZE(YY)
/* Copy results from the contiguous NEW_Y area to the caller-visible  */
/* y (addressed through YY), then release the stack frame.            */
/* NOTE(review): the branch consuming this compare, the loop          */
/* labels/counters, the YY pointer bumps between store pairs,         */
/* register restores and the return instruction are not visible in    */
/* this view.                                                         */
/* Presumably the copy-back is skipped when the scaled INCY equals    */
/* 2 * SIZE, i.e. when y was updated in place - confirm.              */
1494 cmpwi cr0, INCY, 2 * SIZE
/* Eight-value variant: load eight consecutive values from NEW_Y and  */
/* advance NEW_Y by 8 * SIZE.                                         */
1518 LFD f8, 0 * SIZE(NEW_Y)
1519 LFD f9, 1 * SIZE(NEW_Y)
1520 LFD f10, 2 * SIZE(NEW_Y)
1521 LFD f11, 3 * SIZE(NEW_Y)
1522 LFD f12, 4 * SIZE(NEW_Y)
1523 LFD f13, 5 * SIZE(NEW_Y)
1524 LFD f14, 6 * SIZE(NEW_Y)
1525 LFD f15, 7 * SIZE(NEW_Y)
1526 addi NEW_Y, NEW_Y, 8 * SIZE
/* Each pair is stored at offsets 0/1 of YY; YY must be advanced      */
/* between pairs on lines not shown (presumably by INCY), otherwise   */
/* the pairs would overwrite each other.                              */
1537 STFD f8, 0 * SIZE(YY)
1538 STFD f9, 1 * SIZE(YY)
1540 STFD f10, 0 * SIZE(YY)
1541 STFD f11, 1 * SIZE(YY)
1543 STFD f12, 0 * SIZE(YY)
1544 STFD f13, 1 * SIZE(YY)
1546 STFD f14, 0 * SIZE(YY)
1547 STFD f15, 1 * SIZE(YY)
/* Four-value variant of the same copy.                               */
1563 LFD f8, 0 * SIZE(NEW_Y)
1564 LFD f9, 1 * SIZE(NEW_Y)
1565 LFD f10, 2 * SIZE(NEW_Y)
1566 LFD f11, 3 * SIZE(NEW_Y)
1567 addi NEW_Y, NEW_Y, 4 * SIZE
1574 STFD f8, 0 * SIZE(YY)
1575 STFD f9, 1 * SIZE(YY)
1577 STFD f10, 0 * SIZE(YY)
1578 STFD f11, 1 * SIZE(YY)
/* Two-value variant of the same copy.                                */
1589 LFD f8, 0 * SIZE(NEW_Y)
1590 LFD f9, 1 * SIZE(NEW_Y)
1595 STFD f8, 0 * SIZE(YY)
1596 STFD f9, 1 * SIZE(YY)
/* Release the frame allocated in the prologue by adding STACKSIZE    */
/* back to SP.                                                        */
1653 addi SP, SP, STACKSIZE