1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
/* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/*
 * Build-time configuration: platform/ABI guards, prefetch distance, filler
 * instructions, stack-frame layout and the FMADD1/FMADD2 instruction
 * selectors.
 * NOTE(review): the matching #else/#elif/#endif lines and most of the guards
 * are not visible in this view, so which branch each #define belongs to
 * must be confirmed against the complete file.
 */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/*
 * PREFETCHSIZE_A is multiplied by SIZE and loaded into PREA in the prologue.
 * Several alternative values follow; only the PPC440/PPC440FP2 guard is
 * visible here, the guards of the other values are not.
 */
145 #define PREFETCHSIZE_A 24
148 #if defined(PPC440) || defined(PPC440FP2)
149 #define PREFETCHSIZE_A 24
153 #define PREFETCHSIZE_A 32
157 #define PREFETCHSIZE_A 72
161 #define PREFETCHSIZE_A 16
165 #define PREFETCHSIZE_A 96
169 #define PREFETCHSIZE_A 112
173 #define PREFETCHSIZE_A 112
/*
 * NOP1/NOP2 expand to a register move onto itself, which changes no state.
 * NOTE(review): presumably scheduling filler; whether these definitions sit
 * in the #if or the #else branch of the CPU test below is not visible.
 */
176 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
180 #define NOP1 mr LDA, LDA
181 #define NOP2 mr INCX, INCX
/*
 * Two alternative frame layouts (selecting condition not visible). In both,
 * ALPHA_R, ALPHA_I and FZERO are the last three 8-byte slots of the frame:
 * STACKSIZE - 24, STACKSIZE - 16 and STACKSIZE - 8.
 */
187 #define STACKSIZE 224
188 #define ALPHA_R 200(SP)
189 #define ALPHA_I 208(SP)
190 #define FZERO 216(SP)
192 #define STACKSIZE 280
193 #define ALPHA_R 256(SP)
194 #define ALPHA_I 264(SP)
195 #define FZERO 272(SP)
/*
 * FMADD1/FMADD2 name the instruction used for the xsum terms that multiply
 * the odd-offset matrix values (a2, a4, a6, a8). Only FNMSUB definitions are
 * visible here.
 * NOTE(review): the conditions selecting them, and any alternative
 * definitions, are not visible in this view.
 */
199 #define FMADD1 FNMSUB
203 #define FMADD2 FNMSUB
/*
 * Prologue: reserve the stack frame, fetch the arguments that were passed in
 * memory, spill alpha and scale the strides.
 * NOTE(review): the entry label, the register saves and the conditionals
 * around the argument loads are not visible in this view.
 */
209 addi SP, SP, -STACKSIZE
/*
 * After the adjustment above, STACKSIZE(SP) is the stack pointer value on
 * entry, so FRAMESLOT(n) + STACKSIZE(SP) addresses slots relative to the
 * incoming SP (FRAMESLOT is defined elsewhere). lwz loads a 32-bit value,
 * ld a 64-bit one. The groups below are alternatives; only one
 * _AIX/__APPLE__ guard is visible.
 */
268 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
270 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
271 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
272 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
276 #if defined(_AIX) || defined(__APPLE__)
279 lwz X, FRAMESLOT(0) + STACKSIZE(SP)
280 lwz INCX, FRAMESLOT(1) + STACKSIZE(SP)
281 lwz Y, FRAMESLOT(2) + STACKSIZE(SP)
282 lwz INCY, FRAMESLOT(3) + STACKSIZE(SP)
283 lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP)
285 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
286 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
287 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
290 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
291 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
292 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
/* Save both parts of alpha in their frame slots so they can be reloaded later. */
296 STFD alpha_r, ALPHA_R
297 STFD alpha_i, ALPHA_I
/*
 * Multiply LDA, INCX and INCY by 2^ZBASE_SHIFT.
 * NOTE(review): presumably converts element counts into byte strides of
 * complex elements; ZBASE_SHIFT is defined elsewhere. slwi is a 32-bit
 * shift - TODO confirm the scaled strides always fit in 32 bits on 64-bit
 * builds.
 */
299 slwi LDA, LDA, ZBASE_SHIFT
300 slwi INCX, INCX, ZBASE_SHIFT
301 slwi INCY, INCY, ZBASE_SHIFT
/* PREA = PREFETCHSIZE_A * SIZE; the instructions that consume PREA are not visible here. */
303 li PREA, PREFETCHSIZE_A * SIZE
/*
 * Buffer preparation for X and Y.
 * NOTE(review): labels, branches, loop counters and the loads that fill
 * a1..a8 are not visible in this view.
 */
/* Compare the scaled X stride with 2 * SIZE; the branch that uses cr0 is not visible. */
308 cmpwi cr0, INCX, 2 * SIZE
/* Store eight values (a1..a8) contiguously at BUFFER, then advance BUFFER past them. */
336 STFD a1, 0 * SIZE(BUFFER)
337 STFD a2, 1 * SIZE(BUFFER)
338 STFD a3, 2 * SIZE(BUFFER)
339 STFD a4, 3 * SIZE(BUFFER)
340 STFD a5, 4 * SIZE(BUFFER)
341 STFD a6, 5 * SIZE(BUFFER)
342 STFD a7, 6 * SIZE(BUFFER)
343 STFD a8, 7 * SIZE(BUFFER)
345 addi BUFFER, BUFFER, 8 * SIZE
/* Remainder form of the copy: two values per step. */
360 STFD a1, 0 * SIZE(BUFFER)
361 STFD a2, 1 * SIZE(BUFFER)
363 addi BUFFER, BUFFER, 2 * SIZE
/* Compare the scaled Y stride with 2 * SIZE; the branch that uses cr0 is not visible. */
371 cmpwi cr0, INCY, 2 * SIZE
/*
 * Fill eight consecutive slots at BUFFER with the value in f0, then advance.
 * NOTE(review): f0 presumably holds zero at this point (see FZERO) - the
 * instruction that sets it is not visible; confirm.
 */
382 STFD f0, 0 * SIZE(BUFFER)
383 STFD f0, 1 * SIZE(BUFFER)
384 STFD f0, 2 * SIZE(BUFFER)
385 STFD f0, 3 * SIZE(BUFFER)
386 STFD f0, 4 * SIZE(BUFFER)
387 STFD f0, 5 * SIZE(BUFFER)
388 STFD f0, 6 * SIZE(BUFFER)
389 STFD f0, 7 * SIZE(BUFFER)
390 addi BUFFER, BUFFER, 8 * SIZE
/*
 * Set-up for processing two matrix columns (AO1 and AO2) at once.
 * The comments below assume FMADD d,a,b,c computes a*b + c and
 * FNMSUB d,a,b,c computes c - a*b (the PowerPC fmadd/fnmsub semantics);
 * the macros themselves are defined elsewhere.
 * NOTE(review): the label, the pointer initialisation of AO1/AO2/XX/YY and
 * the loads of y05/y06 are not visible in this view.
 */
/* TEMP = IS << ZBASE_SHIFT; the instruction that consumes TEMP is not visible. */
403 slwi TEMP, IS, ZBASE_SHIFT
/* atemp1..atemp4: four consecutive values from XX (two real/imag pairs if the data is interleaved complex). */
415 LFD atemp1, 0 * SIZE(XX)
416 LFD atemp2, 1 * SIZE(XX)
417 LFD atemp3, 2 * SIZE(XX)
418 LFD atemp4, 3 * SIZE(XX)
/* Four values from the AO1 column; only offsets 2 and 3 of the AO2 column are read here. */
420 LFD a1, 0 * SIZE(AO1)
421 LFD a2, 1 * SIZE(AO1)
422 LFD a3, 2 * SIZE(AO1)
423 LFD a4, 3 * SIZE(AO1)
425 LFD a7, 2 * SIZE(AO2)
426 LFD a8, 3 * SIZE(AO2)
/*
 * Start the xsum accumulators. xsum1/xsum2 receive the products with
 * (a1,a2) and (a3,a4); xsum3/xsum4 the products with (a3,a4) and (a7,a8).
 * The (a3,a4) pair is therefore used in both sums. Loads of the next
 * operands and the pointer increments are interleaved with the arithmetic.
 */
428 FMUL xsum1, atemp1, a1
429 addi AO2, AO2, 4 * SIZE
430 FMUL xsum2, atemp2, a1
431 LFD a1, 4 * SIZE(AO1)
432 FMUL xsum3, atemp1, a3
433 addi AO1, AO1, 4 * SIZE
434 FMUL xsum4, atemp2, a3
435 LFD a5, 0 * SIZE(AO2)
438 FNMSUB xsum1, atemp2, a2, xsum1
440 addi XX, XX, 4 * SIZE
442 FMADD xsum2, atemp1, a2, xsum2
444 LFD a2, 1 * SIZE(AO1)
445 FNMSUB xsum3, atemp2, a4, xsum3
446 addi YY, YY, 4 * SIZE
447 FMADD xsum4, atemp1, a4, xsum4
448 LFD a6, 1 * SIZE(AO2)
450 FMADD xsum1, atemp3, a3, xsum1
452 FMADD xsum2, atemp4, a3, xsum2
453 LFD a3, 2 * SIZE(AO1)
454 FMADD xsum3, atemp3, a7, xsum3
456 FMADD xsum4, atemp4, a7, xsum4
457 LFD a7, 2 * SIZE(AO2)
/* Terms with the odd-offset values a4/a8 use FMADD1/FMADD2, whose definitions are selected at build time. */
459 FMADD1 xsum1, atemp4, a4, xsum1
461 FMADD2 xsum2, atemp3, a4, xsum2
462 LFD a4, 3 * SIZE(AO1)
464 FMADD1 xsum3, atemp4, a8, xsum3
468 FMADD2 xsum4, atemp3, a8, xsum4
470 LFD a8, 3 * SIZE(AO2)
/*
 * Scale the two atemp pairs by the pair (y05, y06):
 *   atemp1' = y05*atemp1 - y06*atemp2,  atemp2' = y05*atemp2 + y06*atemp1
 * and likewise for atemp3/atemp4, i.e. a complex multiplication.
 * NOTE(review): y05/y06 presumably hold alpha_r/alpha_i reloaded from
 * ALPHA_R/ALPHA_I - that load is not visible; confirm.
 * The xtemp registers are reused afterwards for fresh XX values, and
 * y01..y04 are loaded from YY for the accumulation that follows.
 */
472 FMUL xtemp1, y05, atemp1
473 LFD y01, 0 * SIZE(YY)
474 FMUL xtemp2, y06, atemp1
475 LFD y02, 1 * SIZE(YY)
476 FMUL xtemp3, y05, atemp3
477 LFD y03, 2 * SIZE(YY)
478 FMUL xtemp4, y06, atemp3
479 LFD y04, 3 * SIZE(YY)
481 FNMSUB atemp1, y06, atemp2, xtemp1
482 LFD xtemp1, 0 * SIZE(XX)
483 FMADD atemp2, y05, atemp2, xtemp2
484 LFD xtemp2, 1 * SIZE(XX)
485 FNMSUB atemp3, y06, atemp4, xtemp3
486 LFD xtemp3, 2 * SIZE(XX)
487 FMADD atemp4, y05, atemp4, xtemp4
488 LFD xtemp4, 3 * SIZE(XX)
/*
 * First unrolled pass over 16 * SIZE worth of AO1, AO2, XX and YY data.
 * Each step handles four values (offsets k..k+3) and does two things:
 *   - xsum1/xsum2 accumulate products of xtemp1..4 (values from XX) with the
 *     AO1 values (a1..a4); xsum3/xsum4 do the same with the AO2 values
 *     (a5..a8);
 *   - the y registers accumulate atemp1/atemp2 times the AO1 values plus
 *     atemp3/atemp4 times the AO2 values.
 * y01..y04 and y05..y08 alternate between steps so that one group can be
 * stored while the other is being computed. Operands for later steps are
 * loaded between the arithmetic instructions.
 * NOTE(review): the label, loop counter and prefetch instructions that
 * belong here are not visible in this view.
 */
/* Step 1: update y01..y04 (offsets 0..3); start loading y05..y08. */
493 FMADD xsum1, xtemp1, a1, xsum1
495 FMADD y01, atemp1, a1, y01
498 FMADD xsum2, xtemp2, a1, xsum2
500 FMADD y02, atemp2, a1, y02
501 LFD a1, 4 * SIZE(AO1)
503 FMADD xsum3, xtemp1, a5, xsum3
505 FMADD y03, atemp1, a3, y03
508 FMADD xsum4, xtemp2, a5, xsum4
510 FMADD y04, atemp2, a3, y04
513 FMADD1 xsum1, xtemp2, a2, xsum1
514 LFD y05, 4 * SIZE(YY)
515 FNMSUB y01, atemp2, a2, y01
518 FMADD2 xsum2, xtemp1, a2, xsum2
519 LFD y06, 5 * SIZE(YY)
520 FMADD y02, atemp1, a2, y02
521 LFD a2, 5 * SIZE(AO1)
523 FMADD1 xsum3, xtemp2, a6, xsum3
524 LFD xtemp2, 5 * SIZE(XX)
525 FNMSUB y03, atemp2, a4, y03
528 FMADD2 xsum4, xtemp1, a6, xsum4
529 LFD xtemp1, 4 * SIZE(XX)
530 FMADD y04, atemp1, a4, y04
533 FMADD xsum1, xtemp3, a3, xsum1
534 LFD y07, 6 * SIZE(YY)
535 FMADD y01, atemp3, a5, y01
538 FMADD xsum2, xtemp4, a3, xsum2
539 LFD a3, 6 * SIZE(AO1)
540 FMADD y02, atemp4, a5, y02
541 LFD a5, 4 * SIZE(AO2)
543 FMADD xsum3, xtemp3, a7, xsum3
544 LFD y08, 7 * SIZE(YY)
545 FMADD y03, atemp3, a7, y03
548 FMADD xsum4, xtemp4, a7, xsum4
550 FMADD y04, atemp4, a7, y04
551 LFD a7, 6 * SIZE(AO2)
553 FMADD1 xsum1, xtemp4, a4, xsum1
555 FNMSUB y01, atemp4, a6, y01
559 FMADD2 xsum2, xtemp3, a4, xsum2
560 LFD a4, 7 * SIZE(AO1)
561 FMADD y02, atemp3, a6, y02
562 LFD a6, 5 * SIZE(AO2)
564 FMADD1 xsum3, xtemp4, a8, xsum3
565 LFD xtemp4, 7 * SIZE(XX)
566 FNMSUB y03, atemp4, a8, y03
569 FMADD2 xsum4, xtemp3, a8, xsum4
570 LFD xtemp3, 6 * SIZE(XX)
571 FMADD y04, atemp3, a8, y04
572 LFD a8, 7 * SIZE(AO2)
/* Step 2: update y05..y08 (offsets 4..7) while storing the finished y01..y04 to offsets 0..3. */
574 FMADD xsum1, xtemp1, a1, xsum1
575 STFD y01, 0 * SIZE(YY)
576 FMADD y05, atemp1, a1, y05
579 FMADD xsum2, xtemp2, a1, xsum2
580 STFD y02, 1 * SIZE(YY)
581 FMADD y06, atemp2, a1, y06
582 LFD a1, 8 * SIZE(AO1)
584 FMADD xsum3, xtemp1, a5, xsum3
585 STFD y03, 2 * SIZE(YY)
586 FMADD y07, atemp1, a3, y07
589 FMADD xsum4, xtemp2, a5, xsum4
590 STFD y04, 3 * SIZE(YY)
591 FMADD y08, atemp2, a3, y08
594 FMADD1 xsum1, xtemp2, a2, xsum1
595 LFD y01, 8 * SIZE(YY)
596 FNMSUB y05, atemp2, a2, y05
599 FMADD2 xsum2, xtemp1, a2, xsum2
600 LFD y02, 9 * SIZE(YY)
601 FMADD y06, atemp1, a2, y06
602 LFD a2, 9 * SIZE(AO1)
604 FMADD1 xsum3, xtemp2, a6, xsum3
605 LFD xtemp2, 9 * SIZE(XX)
606 FNMSUB y07, atemp2, a4, y07
609 FMADD2 xsum4, xtemp1, a6, xsum4
610 LFD xtemp1, 8 * SIZE(XX)
611 FMADD y08, atemp1, a4, y08
614 FMADD xsum1, xtemp3, a3, xsum1
615 LFD y03, 10 * SIZE(YY)
616 FMADD y05, atemp3, a5, y05
619 FMADD xsum2, xtemp4, a3, xsum2
620 LFD a3, 10 * SIZE(AO1)
621 FMADD y06, atemp4, a5, y06
622 LFD a5, 8 * SIZE(AO2)
624 FMADD xsum3, xtemp3, a7, xsum3
625 LFD y04, 11 * SIZE(YY)
626 FMADD y07, atemp3, a7, y07
629 FMADD xsum4, xtemp4, a7, xsum4
631 FMADD y08, atemp4, a7, y08
632 LFD a7, 10 * SIZE(AO2)
634 FMADD1 xsum1, xtemp4, a4, xsum1
636 FNMSUB y05, atemp4, a6, y05
639 FMADD2 xsum2, xtemp3, a4, xsum2
640 LFD a4, 11 * SIZE(AO1)
641 FMADD y06, atemp3, a6, y06
642 LFD a6, 9 * SIZE(AO2)
644 FMADD1 xsum3, xtemp4, a8, xsum3
645 LFD xtemp4, 11 * SIZE(XX)
646 FNMSUB y07, atemp4, a8, y07
651 FMADD2 xsum4, xtemp3, a8, xsum4
652 LFD xtemp3, 10 * SIZE(XX)
653 FMADD y08, atemp3, a8, y08
654 LFD a8, 11 * SIZE(AO2)
/* Step 3: update y01..y04 (offsets 8..11) while storing y05..y08 to offsets 4..7. */
656 FMADD xsum1, xtemp1, a1, xsum1
657 STFD y05, 4 * SIZE(YY)
658 FMADD y01, atemp1, a1, y01
661 FMADD xsum2, xtemp2, a1, xsum2
662 STFD y06, 5 * SIZE(YY)
663 FMADD y02, atemp2, a1, y02
664 LFD a1, 12 * SIZE(AO1)
666 FMADD xsum3, xtemp1, a5, xsum3
667 STFD y07, 6 * SIZE(YY)
668 FMADD y03, atemp1, a3, y03
671 FMADD xsum4, xtemp2, a5, xsum4
672 STFD y08, 7 * SIZE(YY)
673 FMADD y04, atemp2, a3, y04
676 FMADD1 xsum1, xtemp2, a2, xsum1
677 LFD y05, 12 * SIZE(YY)
678 FNMSUB y01, atemp2, a2, y01
681 FMADD2 xsum2, xtemp1, a2, xsum2
682 LFD y06, 13 * SIZE(YY)
683 FMADD y02, atemp1, a2, y02
684 LFD a2, 13 * SIZE(AO1)
686 FMADD1 xsum3, xtemp2, a6, xsum3
687 LFD xtemp2, 13 * SIZE(XX)
688 FNMSUB y03, atemp2, a4, y03
691 FMADD2 xsum4, xtemp1, a6, xsum4
692 LFD xtemp1, 12 * SIZE(XX)
693 FMADD y04, atemp1, a4, y04
696 FMADD xsum1, xtemp3, a3, xsum1
697 LFD y07, 14 * SIZE(YY)
698 FMADD y01, atemp3, a5, y01
701 FMADD xsum2, xtemp4, a3, xsum2
702 LFD a3, 14 * SIZE(AO1)
703 FMADD y02, atemp4, a5, y02
704 LFD a5, 12 * SIZE(AO2)
706 FMADD xsum3, xtemp3, a7, xsum3
707 LFD y08, 15 * SIZE(YY)
708 FMADD y03, atemp3, a7, y03
711 FMADD xsum4, xtemp4, a7, xsum4
713 FMADD y04, atemp4, a7, y04
714 LFD a7, 14 * SIZE(AO2)
716 FMADD1 xsum1, xtemp4, a4, xsum1
718 FNMSUB y01, atemp4, a6, y01
722 FMADD2 xsum2, xtemp3, a4, xsum2
723 LFD a4, 15 * SIZE(AO1)
724 FMADD y02, atemp3, a6, y02
725 LFD a6, 13 * SIZE(AO2)
727 FMADD1 xsum3, xtemp4, a8, xsum3
728 LFD xtemp4, 15 * SIZE(XX)
729 FNMSUB y03, atemp4, a8, y03
732 FMADD2 xsum4, xtemp3, a8, xsum4
733 LFD xtemp3, 14 * SIZE(XX)
734 FMADD y04, atemp3, a8, y04
735 LFD a8, 15 * SIZE(AO2)
/*
 * Step 4: update y05..y08 (offsets 12..15) while storing y01..y04 to offsets
 * 8..11. AO2, XX, AO1 and YY are each advanced by 16 * SIZE in the middle of
 * this step, so loads issued after an increment use the small offsets of the
 * next 16-value group while loads issued before it still use 16..19.
 */
737 FMADD xsum1, xtemp1, a1, xsum1
738 STFD y01, 8 * SIZE(YY)
739 FMADD y05, atemp1, a1, y05
742 FMADD xsum2, xtemp2, a1, xsum2
743 STFD y02, 9 * SIZE(YY)
744 FMADD y06, atemp2, a1, y06
745 LFD a1, 16 * SIZE(AO1)
747 FMADD xsum3, xtemp1, a5, xsum3
748 STFD y03, 10 * SIZE(YY)
749 FMADD y07, atemp1, a3, y07
752 FMADD xsum4, xtemp2, a5, xsum4
753 STFD y04, 11 * SIZE(YY)
754 FMADD y08, atemp2, a3, y08
757 FMADD1 xsum1, xtemp2, a2, xsum1
758 LFD y01, 16 * SIZE(YY)
759 FNMSUB y05, atemp2, a2, y05
762 FMADD2 xsum2, xtemp1, a2, xsum2
763 LFD y02, 17 * SIZE(YY)
764 FMADD y06, atemp1, a2, y06
765 LFD a2, 17 * SIZE(AO1)
767 FMADD1 xsum3, xtemp2, a6, xsum3
768 LFD xtemp2, 17 * SIZE(XX)
769 FNMSUB y07, atemp2, a4, y07
772 FMADD2 xsum4, xtemp1, a6, xsum4
773 LFD xtemp1, 16 * SIZE(XX)
774 FMADD y08, atemp1, a4, y08
775 addi AO2, AO2, 16 * SIZE
777 FMADD xsum1, xtemp3, a3, xsum1
778 LFD y03, 18 * SIZE(YY)
779 FMADD y05, atemp3, a5, y05
780 addi XX, XX, 16 * SIZE
782 FMADD xsum2, xtemp4, a3, xsum2
783 LFD a3, 18 * SIZE(AO1)
784 FMADD y06, atemp4, a5, y06
785 LFD a5, 0 * SIZE(AO2)
787 FMADD xsum3, xtemp3, a7, xsum3
788 LFD y04, 19 * SIZE(YY)
789 FMADD y07, atemp3, a7, y07
792 FMADD xsum4, xtemp4, a7, xsum4
793 addi AO1, AO1, 16 * SIZE
794 FMADD y08, atemp4, a7, y08
795 LFD a7, 2 * SIZE(AO2)
797 FMADD1 xsum1, xtemp4, a4, xsum1
798 addi YY, YY, 16 * SIZE
799 FNMSUB y05, atemp4, a6, y05
802 FMADD2 xsum2, xtemp3, a4, xsum2
803 LFD a4, 3 * SIZE(AO1)
804 FMADD y06, atemp3, a6, y06
805 LFD a6, 1 * SIZE(AO2)
807 FMADD1 xsum3, xtemp4, a8, xsum3
808 LFD xtemp4, 3 * SIZE(XX)
809 FNMSUB y07, atemp4, a8, y07
812 FMADD2 xsum4, xtemp3, a8, xsum4
813 LFD xtemp3, 2 * SIZE(XX)
814 FMADD y08, atemp3, a8, y08
815 LFD a8, 3 * SIZE(AO2)
/*
 * Second unrolled pass, same arithmetic as the pass before it: the xsum
 * registers accumulate XX values times the AO1/AO2 values, the y registers
 * accumulate atemp times the AO1/AO2 values, over another 16 * SIZE of data.
 * It differs from the first pass in that it starts by storing the y05..y08
 * results left over from the previous 16-value group at offsets -4..-1
 * (YY has already been advanced past them).
 * NOTE(review): presumably this is the steady-state loop body; the label and
 * the loop-closing branch are not visible in this view.
 */
/* Step 1: update y01..y04 (offsets 0..3); store the previous group's y05..y08 behind YY. */
817 FMADD xsum1, xtemp1, a1, xsum1
818 STFD y05, -4 * SIZE(YY)
819 FMADD y01, atemp1, a1, y01
822 FMADD xsum2, xtemp2, a1, xsum2
823 STFD y06, -3 * SIZE(YY)
824 FMADD y02, atemp2, a1, y02
825 LFD a1, 4 * SIZE(AO1)
827 FMADD xsum3, xtemp1, a5, xsum3
828 STFD y07, -2 * SIZE(YY)
829 FMADD y03, atemp1, a3, y03
832 FMADD xsum4, xtemp2, a5, xsum4
833 STFD y08, -1 * SIZE(YY)
834 FMADD y04, atemp2, a3, y04
837 FMADD1 xsum1, xtemp2, a2, xsum1
838 LFD y05, 4 * SIZE(YY)
839 FNMSUB y01, atemp2, a2, y01
842 FMADD2 xsum2, xtemp1, a2, xsum2
843 LFD y06, 5 * SIZE(YY)
844 FMADD y02, atemp1, a2, y02
845 LFD a2, 5 * SIZE(AO1)
847 FMADD1 xsum3, xtemp2, a6, xsum3
848 LFD xtemp2, 5 * SIZE(XX)
849 FNMSUB y03, atemp2, a4, y03
852 FMADD2 xsum4, xtemp1, a6, xsum4
853 LFD xtemp1, 4 * SIZE(XX)
854 FMADD y04, atemp1, a4, y04
857 FMADD xsum1, xtemp3, a3, xsum1
858 LFD y07, 6 * SIZE(YY)
859 FMADD y01, atemp3, a5, y01
862 FMADD xsum2, xtemp4, a3, xsum2
863 LFD a3, 6 * SIZE(AO1)
864 FMADD y02, atemp4, a5, y02
865 LFD a5, 4 * SIZE(AO2)
867 FMADD xsum3, xtemp3, a7, xsum3
868 LFD y08, 7 * SIZE(YY)
869 FMADD y03, atemp3, a7, y03
872 FMADD xsum4, xtemp4, a7, xsum4
874 FMADD y04, atemp4, a7, y04
875 LFD a7, 6 * SIZE(AO2)
877 FMADD1 xsum1, xtemp4, a4, xsum1
879 FNMSUB y01, atemp4, a6, y01
883 FMADD2 xsum2, xtemp3, a4, xsum2
884 LFD a4, 7 * SIZE(AO1)
885 FMADD y02, atemp3, a6, y02
886 LFD a6, 5 * SIZE(AO2)
888 FMADD1 xsum3, xtemp4, a8, xsum3
889 LFD xtemp4, 7 * SIZE(XX)
890 FNMSUB y03, atemp4, a8, y03
893 FMADD2 xsum4, xtemp3, a8, xsum4
894 LFD xtemp3, 6 * SIZE(XX)
895 FMADD y04, atemp3, a8, y04
896 LFD a8, 7 * SIZE(AO2)
/* Step 2: update y05..y08 (offsets 4..7) while storing y01..y04 to offsets 0..3. */
898 FMADD xsum1, xtemp1, a1, xsum1
899 STFD y01, 0 * SIZE(YY)
900 FMADD y05, atemp1, a1, y05
903 FMADD xsum2, xtemp2, a1, xsum2
904 STFD y02, 1 * SIZE(YY)
905 FMADD y06, atemp2, a1, y06
906 LFD a1, 8 * SIZE(AO1)
908 FMADD xsum3, xtemp1, a5, xsum3
909 STFD y03, 2 * SIZE(YY)
910 FMADD y07, atemp1, a3, y07
913 FMADD xsum4, xtemp2, a5, xsum4
914 STFD y04, 3 * SIZE(YY)
915 FMADD y08, atemp2, a3, y08
918 FMADD1 xsum1, xtemp2, a2, xsum1
919 LFD y01, 8 * SIZE(YY)
920 FNMSUB y05, atemp2, a2, y05
923 FMADD2 xsum2, xtemp1, a2, xsum2
924 LFD y02, 9 * SIZE(YY)
925 FMADD y06, atemp1, a2, y06
926 LFD a2, 9 * SIZE(AO1)
928 FMADD1 xsum3, xtemp2, a6, xsum3
929 LFD xtemp2, 9 * SIZE(XX)
930 FNMSUB y07, atemp2, a4, y07
933 FMADD2 xsum4, xtemp1, a6, xsum4
934 LFD xtemp1, 8 * SIZE(XX)
935 FMADD y08, atemp1, a4, y08
938 FMADD xsum1, xtemp3, a3, xsum1
939 LFD y03, 10 * SIZE(YY)
940 FMADD y05, atemp3, a5, y05
943 FMADD xsum2, xtemp4, a3, xsum2
944 LFD a3, 10 * SIZE(AO1)
945 FMADD y06, atemp4, a5, y06
946 LFD a5, 8 * SIZE(AO2)
948 FMADD xsum3, xtemp3, a7, xsum3
949 LFD y04, 11 * SIZE(YY)
950 FMADD y07, atemp3, a7, y07
953 FMADD xsum4, xtemp4, a7, xsum4
955 FMADD y08, atemp4, a7, y08
956 LFD a7, 10 * SIZE(AO2)
958 FMADD1 xsum1, xtemp4, a4, xsum1
960 FNMSUB y05, atemp4, a6, y05
963 FMADD2 xsum2, xtemp3, a4, xsum2
964 LFD a4, 11 * SIZE(AO1)
965 FMADD y06, atemp3, a6, y06
966 LFD a6, 9 * SIZE(AO2)
968 FMADD1 xsum3, xtemp4, a8, xsum3
969 LFD xtemp4, 11 * SIZE(XX)
970 FNMSUB y07, atemp4, a8, y07
975 FMADD2 xsum4, xtemp3, a8, xsum4
976 LFD xtemp3, 10 * SIZE(XX)
977 FMADD y08, atemp3, a8, y08
978 LFD a8, 11 * SIZE(AO2)
/* Step 3: update y01..y04 (offsets 8..11) while storing y05..y08 to offsets 4..7. */
980 FMADD xsum1, xtemp1, a1, xsum1
981 STFD y05, 4 * SIZE(YY)
982 FMADD y01, atemp1, a1, y01
985 FMADD xsum2, xtemp2, a1, xsum2
986 STFD y06, 5 * SIZE(YY)
987 FMADD y02, atemp2, a1, y02
988 LFD a1, 12 * SIZE(AO1)
990 FMADD xsum3, xtemp1, a5, xsum3
991 STFD y07, 6 * SIZE(YY)
992 FMADD y03, atemp1, a3, y03
995 FMADD xsum4, xtemp2, a5, xsum4
996 STFD y08, 7 * SIZE(YY)
997 FMADD y04, atemp2, a3, y04
1000 FMADD1 xsum1, xtemp2, a2, xsum1
1001 LFD y05, 12 * SIZE(YY)
1002 FNMSUB y01, atemp2, a2, y01
1005 FMADD2 xsum2, xtemp1, a2, xsum2
1006 LFD y06, 13 * SIZE(YY)
1007 FMADD y02, atemp1, a2, y02
1008 LFD a2, 13 * SIZE(AO1)
1010 FMADD1 xsum3, xtemp2, a6, xsum3
1011 LFD xtemp2, 13 * SIZE(XX)
1012 FNMSUB y03, atemp2, a4, y03
1015 FMADD2 xsum4, xtemp1, a6, xsum4
1016 LFD xtemp1, 12 * SIZE(XX)
1017 FMADD y04, atemp1, a4, y04
1020 FMADD xsum1, xtemp3, a3, xsum1
1021 LFD y07, 14 * SIZE(YY)
1022 FMADD y01, atemp3, a5, y01
1025 FMADD xsum2, xtemp4, a3, xsum2
1026 LFD a3, 14 * SIZE(AO1)
1027 FMADD y02, atemp4, a5, y02
1028 LFD a5, 12 * SIZE(AO2)
1030 FMADD xsum3, xtemp3, a7, xsum3
1031 LFD y08, 15 * SIZE(YY)
1032 FMADD y03, atemp3, a7, y03
1035 FMADD xsum4, xtemp4, a7, xsum4
1037 FMADD y04, atemp4, a7, y04
1038 LFD a7, 14 * SIZE(AO2)
1040 FMADD1 xsum1, xtemp4, a4, xsum1
1042 FNMSUB y01, atemp4, a6, y01
1045 FMADD2 xsum2, xtemp3, a4, xsum2
1046 LFD a4, 15 * SIZE(AO1)
1047 FMADD y02, atemp3, a6, y02
1048 LFD a6, 13 * SIZE(AO2)
1050 FMADD1 xsum3, xtemp4, a8, xsum3
1051 LFD xtemp4, 15 * SIZE(XX)
1052 FNMSUB y03, atemp4, a8, y03
1055 FMADD2 xsum4, xtemp3, a8, xsum4
1056 LFD xtemp3, 14 * SIZE(XX)
1057 FMADD y04, atemp3, a8, y04
1058 LFD a8, 15 * SIZE(AO2)
/*
 * Step 4: update y05..y08 (offsets 12..15) while storing y01..y04 to offsets
 * 8..11. AO2, XX, AO1 and YY are each advanced by 16 * SIZE in the middle of
 * this step; loads issued after an increment already address the next group.
 */
1060 FMADD xsum1, xtemp1, a1, xsum1
1061 STFD y01, 8 * SIZE(YY)
1062 FMADD y05, atemp1, a1, y05
1065 FMADD xsum2, xtemp2, a1, xsum2
1066 STFD y02, 9 * SIZE(YY)
1067 FMADD y06, atemp2, a1, y06
1068 LFD a1, 16 * SIZE(AO1)
1070 FMADD xsum3, xtemp1, a5, xsum3
1071 STFD y03, 10 * SIZE(YY)
1072 FMADD y07, atemp1, a3, y07
1075 FMADD xsum4, xtemp2, a5, xsum4
1076 STFD y04, 11 * SIZE(YY)
1077 FMADD y08, atemp2, a3, y08
1080 FMADD1 xsum1, xtemp2, a2, xsum1
1081 LFD y01, 16 * SIZE(YY)
1082 FNMSUB y05, atemp2, a2, y05
1085 FMADD2 xsum2, xtemp1, a2, xsum2
1086 LFD y02, 17 * SIZE(YY)
1087 FMADD y06, atemp1, a2, y06
1088 LFD a2, 17 * SIZE(AO1)
1090 FMADD1 xsum3, xtemp2, a6, xsum3
1091 LFD xtemp2, 17 * SIZE(XX)
1092 FNMSUB y07, atemp2, a4, y07
1095 FMADD2 xsum4, xtemp1, a6, xsum4
1096 LFD xtemp1, 16 * SIZE(XX)
1097 FMADD y08, atemp1, a4, y08
1098 addi AO2, AO2, 16 * SIZE
1100 FMADD xsum1, xtemp3, a3, xsum1
1101 LFD y03, 18 * SIZE(YY)
1102 FMADD y05, atemp3, a5, y05
1103 addi XX, XX, 16 * SIZE
1105 FMADD xsum2, xtemp4, a3, xsum2
1106 LFD a3, 18 * SIZE(AO1)
1107 FMADD y06, atemp4, a5, y06
1108 LFD a5, 0 * SIZE(AO2)
1110 FMADD xsum3, xtemp3, a7, xsum3
1111 LFD y04, 19 * SIZE(YY)
1112 FMADD y07, atemp3, a7, y07
1115 FMADD xsum4, xtemp4, a7, xsum4
1116 addi AO1, AO1, 16 * SIZE
1117 FMADD y08, atemp4, a7, y08
1118 LFD a7, 2 * SIZE(AO2)
1120 FMADD1 xsum1, xtemp4, a4, xsum1
1121 addi YY, YY, 16 * SIZE
1122 FNMSUB y05, atemp4, a6, y05
1125 FMADD2 xsum2, xtemp3, a4, xsum2
1126 LFD a4, 3 * SIZE(AO1)
1127 FMADD y06, atemp3, a6, y06
1128 LFD a6, 1 * SIZE(AO2)
1130 FMADD1 xsum3, xtemp4, a8, xsum3
1131 LFD xtemp4, 3 * SIZE(XX)
1132 FNMSUB y07, atemp4, a8, y07
1135 FMADD2 xsum4, xtemp3, a8, xsum4
1136 LFD xtemp3, 2 * SIZE(XX)
1137 FMADD y08, atemp3, a8, y08
1138 LFD a8, 3 * SIZE(AO2)
/* Flush the last y05..y08 results; YY was already advanced, hence the negative offsets. */
1140 STFD y05, -4 * SIZE(YY)
1141 STFD y06, -3 * SIZE(YY)
1142 STFD y07, -2 * SIZE(YY)
1143 STFD y08, -1 * SIZE(YY)
/*
 * Remainder section covering 8 * SIZE of AO1, AO2, XX and YY data (two
 * steps of four values). The arithmetic pattern is the same as in the
 * 16-value passes: xsum1..4 accumulate XX values times the AO1/AO2 values,
 * y01..y08 accumulate atemp times the AO1/AO2 values.
 * NOTE(review): the test that decides whether this section runs is not
 * visible in this view.
 */
/* Step 1: update y01..y04 (offsets 0..3); load y05..y08 for the next step. */
1150 FMADD xsum1, xtemp1, a1, xsum1
1152 FMADD y01, atemp1, a1, y01
1155 FMADD xsum2, xtemp2, a1, xsum2
1157 FMADD y02, atemp2, a1, y02
1158 LFD a1, 4 * SIZE(AO1)
1160 FMADD xsum3, xtemp1, a5, xsum3
1162 FMADD y03, atemp1, a3, y03
1165 FMADD xsum4, xtemp2, a5, xsum4
1167 FMADD y04, atemp2, a3, y04
1170 FMADD1 xsum1, xtemp2, a2, xsum1
1171 LFD y05, 4 * SIZE(YY)
1172 FNMSUB y01, atemp2, a2, y01
1175 FMADD2 xsum2, xtemp1, a2, xsum2
1176 LFD y06, 5 * SIZE(YY)
1177 FMADD y02, atemp1, a2, y02
1178 LFD a2, 5 * SIZE(AO1)
1180 FMADD1 xsum3, xtemp2, a6, xsum3
1181 LFD xtemp2, 5 * SIZE(XX)
1182 FNMSUB y03, atemp2, a4, y03
1185 FMADD2 xsum4, xtemp1, a6, xsum4
1186 LFD xtemp1, 4 * SIZE(XX)
1187 FMADD y04, atemp1, a4, y04
1190 FMADD xsum1, xtemp3, a3, xsum1
1191 LFD y07, 6 * SIZE(YY)
1192 FMADD y01, atemp3, a5, y01
1195 FMADD xsum2, xtemp4, a3, xsum2
1196 LFD a3, 6 * SIZE(AO1)
1197 FMADD y02, atemp4, a5, y02
1198 LFD a5, 4 * SIZE(AO2)
1200 FMADD xsum3, xtemp3, a7, xsum3
1201 LFD y08, 7 * SIZE(YY)
1202 FMADD y03, atemp3, a7, y03
1205 FMADD xsum4, xtemp4, a7, xsum4
1207 FMADD y04, atemp4, a7, y04
1208 LFD a7, 6 * SIZE(AO2)
1210 FMADD1 xsum1, xtemp4, a4, xsum1
1212 FNMSUB y01, atemp4, a6, y01
1215 FMADD2 xsum2, xtemp3, a4, xsum2
1216 LFD a4, 7 * SIZE(AO1)
1217 FMADD y02, atemp3, a6, y02
1218 LFD a6, 5 * SIZE(AO2)
1220 FMADD1 xsum3, xtemp4, a8, xsum3
1221 LFD xtemp4, 7 * SIZE(XX)
1222 FNMSUB y03, atemp4, a8, y03
1225 FMADD2 xsum4, xtemp3, a8, xsum4
1226 LFD xtemp3, 6 * SIZE(XX)
1227 FMADD y04, atemp3, a8, y04
1228 LFD a8, 7 * SIZE(AO2)
/*
 * Step 2: update y05..y08 (offsets 4..7) while storing y01..y04 to offsets
 * 0..3. YY, AO2, XX and AO1 are each advanced by 8 * SIZE during this step;
 * offsets used after an increment are relative to the advanced pointer.
 */
1230 FMADD xsum1, xtemp1, a1, xsum1
1231 STFD y01, 0 * SIZE(YY)
1232 FMADD y05, atemp1, a1, y05
1235 FMADD xsum2, xtemp2, a1, xsum2
1236 STFD y02, 1 * SIZE(YY)
1237 FMADD y06, atemp2, a1, y06
1238 LFD a1, 8 * SIZE(AO1)
1240 FMADD xsum3, xtemp1, a5, xsum3
1241 STFD y03, 2 * SIZE(YY)
1242 FMADD y07, atemp1, a3, y07
1245 FMADD xsum4, xtemp2, a5, xsum4
1246 STFD y04, 3 * SIZE(YY)
1247 FMADD y08, atemp2, a3, y08
1250 FMADD1 xsum1, xtemp2, a2, xsum1
1251 LFD y01, 8 * SIZE(YY)
1252 FNMSUB y05, atemp2, a2, y05
1255 FMADD2 xsum2, xtemp1, a2, xsum2
1256 LFD y02, 9 * SIZE(YY)
1257 FMADD y06, atemp1, a2, y06
1258 LFD a2, 9 * SIZE(AO1)
1260 FMADD1 xsum3, xtemp2, a6, xsum3
1261 LFD xtemp2, 9 * SIZE(XX)
1262 FNMSUB y07, atemp2, a4, y07
1265 FMADD2 xsum4, xtemp1, a6, xsum4
1266 LFD xtemp1, 8 * SIZE(XX)
1267 FMADD y08, atemp1, a4, y08
1270 FMADD xsum1, xtemp3, a3, xsum1
1271 LFD y03, 10 * SIZE(YY)
1272 FMADD y05, atemp3, a5, y05
1275 FMADD xsum2, xtemp4, a3, xsum2
1276 LFD a3, 10 * SIZE(AO1)
1277 FMADD y06, atemp4, a5, y06
1278 LFD a5, 8 * SIZE(AO2)
1280 FMADD xsum3, xtemp3, a7, xsum3
1281 LFD y04, 11 * SIZE(YY)
1282 FMADD y07, atemp3, a7, y07
1285 FMADD xsum4, xtemp4, a7, xsum4
1286 addi YY, YY, 8 * SIZE
1287 FMADD y08, atemp4, a7, y08
1288 LFD a7, 10 * SIZE(AO2)
1290 FMADD1 xsum1, xtemp4, a4, xsum1
1291 addi AO2, AO2, 8 * SIZE
1292 FNMSUB y05, atemp4, a6, y05
1293 addi XX, XX, 8 * SIZE
1295 FMADD2 xsum2, xtemp3, a4, xsum2
1296 LFD a4, 11 * SIZE(AO1)
1297 FMADD y06, atemp3, a6, y06
1298 LFD a6, 1 * SIZE(AO2)
1300 FMADD1 xsum3, xtemp4, a8, xsum3
1301 LFD xtemp4, 3 * SIZE(XX)
1302 FNMSUB y07, atemp4, a8, y07
1303 addi AO1, AO1, 8 * SIZE
1305 FMADD2 xsum4, xtemp3, a8, xsum4
1306 LFD xtemp3, 2 * SIZE(XX)
1307 FMADD y08, atemp3, a8, y08
1308 LFD a8, 3 * SIZE(AO2)
/* Flush y05..y08; YY was already advanced, hence the negative offsets. */
1310 STFD y05, -4 * SIZE(YY)
1311 STFD y06, -3 * SIZE(YY)
1312 STFD y07, -2 * SIZE(YY)
1313 STFD y08, -1 * SIZE(YY)
/*
 * Remainder section covering 4 * SIZE of data (a single step): y01..y04 are
 * updated with atemp times the AO1/AO2 values and xsum1..4 with the XX
 * values times the AO1/AO2 values. AO1, AO2 and YY are advanced by
 * 4 * SIZE; no XX increment is visible in this section.
 * NOTE(review): the test that decides whether this section runs is not
 * visible in this view.
 */
1320 FMADD xsum1, xtemp1, a1, xsum1
1322 FMADD y01, atemp1, a1, y01
1325 FMADD xsum2, xtemp2, a1, xsum2
1327 FMADD y02, atemp2, a1, y02
1328 LFD a1, 4 * SIZE(AO1)
1330 FMADD xsum3, xtemp1, a5, xsum3
1331 FMADD y03, atemp1, a3, y03
1332 FMADD xsum4, xtemp2, a5, xsum4
1333 FMADD y04, atemp2, a3, y04
1335 FMADD1 xsum1, xtemp2, a2, xsum1
1337 FNMSUB y01, atemp2, a2, y01
1340 FMADD2 xsum2, xtemp1, a2, xsum2
1342 FMADD y02, atemp1, a2, y02
1343 LFD a2, 5 * SIZE(AO1)
1345 FMADD1 xsum3, xtemp2, a6, xsum3
1346 LFD xtemp2, 5 * SIZE(XX)
1347 FNMSUB y03, atemp2, a4, y03
1350 FMADD2 xsum4, xtemp1, a6, xsum4
1351 LFD xtemp1, 4 * SIZE(XX)
1352 FMADD y04, atemp1, a4, y04
1355 FMADD xsum1, xtemp3, a3, xsum1
1357 FMADD y01, atemp3, a5, y01
1360 FMADD xsum2, xtemp4, a3, xsum2
1362 FMADD y02, atemp4, a5, y02
1363 LFD a5, 4 * SIZE(AO2)
1365 FMADD xsum3, xtemp3, a7, xsum3
1366 FMADD y03, atemp3, a7, y03
1367 FMADD xsum4, xtemp4, a7, xsum4
1368 FMADD y04, atemp4, a7, y04
1370 FMADD1 xsum1, xtemp4, a4, xsum1
1372 FNMSUB y01, atemp4, a6, y01
1375 FMADD2 xsum2, xtemp3, a4, xsum2
1377 FMADD y02, atemp3, a6, y02
1378 LFD a6, 5 * SIZE(AO2)
1380 FMADD1 xsum3, xtemp4, a8, xsum3
1381 addi AO1, AO1, 4 * SIZE
1382 FNMSUB y03, atemp4, a8, y03
1383 addi AO2, AO2, 4 * SIZE
1384 FMADD2 xsum4, xtemp3, a8, xsum4
1385 addi YY, YY, 4 * SIZE
1386 FMADD y04, atemp3, a8, y04
/*
 * Store the four results behind the advanced YY. y01/y02 are immediately
 * reloaded from the new offsets 0 and 1 for the section that follows;
 * y03/y04 are not reloaded here.
 */
1389 STFD y01, -4 * SIZE(YY)
1390 LFD y01, 0 * SIZE(YY)
1391 STFD y02, -3 * SIZE(YY)
1392 LFD y02, 1 * SIZE(YY)
1394 STFD y03, -2 * SIZE(YY)
1395 STFD y04, -1 * SIZE(YY)
/*
 * Remainder section for a final pair of values: only y01/y02 are updated
 * (with atemp1/atemp2 times a1/a2 and atemp3/atemp4 times a5/a6), and
 * xsum1..4 receive the matching xtemp1/xtemp2 products.
 * NOTE(review): the test that decides whether this section runs is not
 * visible in this view.
 */
1402 FMADD xsum1, xtemp1, a1, xsum1
1403 FMADD y01, atemp1, a1, y01
1404 FMADD xsum2, xtemp2, a1, xsum2
1405 FMADD y02, atemp2, a1, y02
1406 FMADD xsum3, xtemp1, a5, xsum3
1407 FNMSUB y01, atemp2, a2, y01
1408 FMADD xsum4, xtemp2, a5, xsum4
1409 FMADD y02, atemp1, a2, y02
1411 FMADD1 xsum1, xtemp2, a2, xsum1
1412 FMADD y01, atemp3, a5, y01
1413 FMADD2 xsum2, xtemp1, a2, xsum2
1414 FMADD y02, atemp4, a5, y02
1415 FMADD1 xsum3, xtemp2, a6, xsum3
1416 FNMSUB y01, atemp4, a6, y01
1417 FMADD2 xsum4, xtemp1, a6, xsum4
1418 FMADD y02, atemp3, a6, y02
/*
 * Store y01..y04 at offsets 0..3. Only y01/y02 were modified in this
 * section; y03/y04 are written back with whatever the preceding code left
 * in them.
 * NOTE(review): confirm that y03/y04 always equal the memory contents at
 * 2 and 3 * SIZE(YY) on every path reaching this point - the preceding
 * 4-value section leaves its own results in them without reloading.
 */
1420 STFD y01, 0 * SIZE(YY)
1421 STFD y02, 1 * SIZE(YY)
1422 STFD y03, 2 * SIZE(YY)
1423 STFD y04, 3 * SIZE(YY)
/*
 * Finish the current pair of columns: scale the two accumulated sums by the
 * pair (y05, y06) and add them to four values of YY.
 * NOTE(review): y05/y06 are used as y accumulators in the loops above, so
 * they must be reloaded before this point - presumably with alpha from
 * ALPHA_R/ALPHA_I; that load and the computation of YY from TEMP are not
 * visible in this view.
 */
/* TEMP = IS << ZBASE_SHIFT; the instruction that consumes TEMP is not visible. */
1430 slwi TEMP, IS, ZBASE_SHIFT
1433 LFD y01, 0 * SIZE(YY)
1434 LFD y02, 1 * SIZE(YY)
1435 LFD y03, 2 * SIZE(YY)
1436 LFD y04, 3 * SIZE(YY)
/*
 * (xsum1, xsum2) and (xsum3, xsum4) are each multiplied by (y05, y06) as a
 * complex product:
 *   xsum1' = y05*xsum1 - y06*xsum2,  xsum2' = y05*xsum2 + y06*xsum1
 */
1438 FMUL xtemp1, y05, xsum1
1439 FMUL xtemp2, y06, xsum1
1440 FMUL xtemp3, y05, xsum3
1441 FMUL xtemp4, y06, xsum3
1443 FNMSUB xsum1, y06, xsum2, xtemp1
1444 FMADD xsum2, y05, xsum2, xtemp2
1445 FNMSUB xsum3, y06, xsum4, xtemp3
1446 FMADD xsum4, y05, xsum4, xtemp4
/* Accumulate the scaled sums into the loaded YY values and write them back. */
1448 FADD y01, y01, xsum1
1449 FADD y02, y02, xsum2
1450 FADD y03, y03, xsum3
1451 FADD y04, y04, xsum4
1453 STFD y01, 0 * SIZE(YY)
1455 STFD y02, 1 * SIZE(YY)
1457 STFD y03, 2 * SIZE(YY)
1459 STFD y04, 3 * SIZE(YY)
/*
 * Single-column variant of the set-up and finish steps above, working on one
 * (atemp1, atemp2) pair and one (xsum1, xsum2) pair.
 * NOTE(review): the label, the loads of a1/a2 and y05/y06, the pointer
 * set-up and whatever executes between the atemp scaling and the final
 * update are not visible in this view.
 */
/* TEMP = IS << ZBASE_SHIFT; the instruction that consumes TEMP is not visible. */
1467 slwi TEMP, IS, ZBASE_SHIFT
1474 LFD atemp1, 0 * SIZE(XX)
1475 LFD atemp2, 1 * SIZE(XX)
/* (xsum1, xsum2) = (atemp1, atemp2) * (a1, a2) as a complex product. */
1480 FMUL xsum1, atemp1, a1
1481 FMUL xsum2, atemp2, a1
1484 FNMSUB xsum1, atemp2, a2, xsum1
1485 FMADD xsum2, atemp1, a2, xsum2
/* Scale (atemp1, atemp2) by (y05, y06), again as a complex product. */
1488 FMUL xtemp1, y05, atemp1
1489 FMUL xtemp2, y06, atemp1
1491 FNMSUB atemp1, y06, atemp2, xtemp1
1492 FMADD atemp2, y05, atemp2, xtemp2
/* Final update: scale (xsum1, xsum2) by (y05, y06) and add the result to two values of YY. */
1497 LFD y01, 0 * SIZE(YY)
1498 LFD y02, 1 * SIZE(YY)
1500 FMUL xtemp1, y05, xsum1
1501 FMUL xtemp2, y06, xsum1
1503 FNMSUB xsum1, y06, xsum2, xtemp1
1504 FMADD xsum2, y05, xsum2, xtemp2
1506 FADD y01, y01, xsum1
1507 FADD y02, y02, xsum2
1509 STFD y01, 0 * SIZE(YY)
1510 STFD y02, 1 * SIZE(YY)
/*
 * Copy results from the contiguous NEW_Y area to YY, then release the stack
 * frame.
 * NOTE(review): labels, branches, loop counters, the YY increments between
 * store pairs, the register restores and the return instruction are not
 * visible in this view.
 */
/* Compare the scaled Y stride with 2 * SIZE; the branch that uses cr0 is not visible. */
1514 cmpwi cr0, INCY, 2 * SIZE
/* Eight values per step: load them contiguously from NEW_Y and advance NEW_Y. */
1538 LFD f8, 0 * SIZE(NEW_Y)
1539 LFD f9, 1 * SIZE(NEW_Y)
1540 LFD f10, 2 * SIZE(NEW_Y)
1541 LFD f11, 3 * SIZE(NEW_Y)
1542 LFD f12, 4 * SIZE(NEW_Y)
1543 LFD f13, 5 * SIZE(NEW_Y)
1544 LFD f14, 6 * SIZE(NEW_Y)
1545 LFD f15, 7 * SIZE(NEW_Y)
1546 addi NEW_Y, NEW_Y, 8 * SIZE
/*
 * Store them as four pairs, each at offsets 0 and 1 of YY.
 * NOTE(review): YY is presumably advanced by INCY between the pairs - those
 * instructions are not visible; confirm.
 */
1557 STFD f8, 0 * SIZE(YY)
1558 STFD f9, 1 * SIZE(YY)
1560 STFD f10, 0 * SIZE(YY)
1561 STFD f11, 1 * SIZE(YY)
1563 STFD f12, 0 * SIZE(YY)
1564 STFD f13, 1 * SIZE(YY)
1566 STFD f14, 0 * SIZE(YY)
1567 STFD f15, 1 * SIZE(YY)
/* Four-value remainder: two pairs. */
1583 LFD f8, 0 * SIZE(NEW_Y)
1584 LFD f9, 1 * SIZE(NEW_Y)
1585 LFD f10, 2 * SIZE(NEW_Y)
1586 LFD f11, 3 * SIZE(NEW_Y)
1587 addi NEW_Y, NEW_Y, 4 * SIZE
1594 STFD f8, 0 * SIZE(YY)
1595 STFD f9, 1 * SIZE(YY)
1597 STFD f10, 0 * SIZE(YY)
1598 STFD f11, 1 * SIZE(YY)
/* Two-value remainder: one pair. */
1609 LFD f8, 0 * SIZE(NEW_Y)
1610 LFD f9, 1 * SIZE(NEW_Y)
1615 STFD f8, 0 * SIZE(YY)
1616 STFD f9, 1 * SIZE(YY)
/* Epilogue: undo the frame allocation made in the prologue (addi SP, SP, -STACKSIZE). */
1673 addi SP, SP, STACKSIZE