1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Platform guards. Their bodies and the matching #else/#endif lines are
   not visible in this view. */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A: seven alternative values follow. Only the
   PPC440/PPC440FP2 guard is visible; the guards selecting the other
   values are not, so which target gets which value cannot be read from
   here. The value is multiplied by SIZE and loaded into PREA during
   setup. */
143 #define PREFETCHSIZE_A 24
146 #if defined(PPC440) || defined(PPC440FP2)
147 #define PREFETCHSIZE_A 24
151 #define PREFETCHSIZE_A 64
155 #define PREFETCHSIZE_A 72
159 #define PREFETCHSIZE_A 16
163 #define PREFETCHSIZE_A 96
167 #define PREFETCHSIZE_A 40
/* On POWER4/POWER5/POWER6/PPC970, NOP1 and NOP2 expand to register
   self-moves (mr r, r), which leave the register value unchanged.
   Definitions for other targets, if any, are not visible here. */
170 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
174 #define NOP1 mr LDA, LDA
175 #define NOP2 mr INCX, INCX
/* Two alternative frame layouts: the frame size plus the SP-relative
   slots named ALPHA and FZERO. The conditional choosing between them is
   not visible -- presumably word size (64-bit vs 32-bit); TODO confirm. */
181 #define STACKSIZE 224
182 #define ALPHA 200(SP)
183 #define FZERO 208(SP)
185 #define STACKSIZE 280
186 #define ALPHA 256(SP)
187 #define FZERO 264(SP)
/* Allocate the local frame: SP -= STACKSIZE. */
193 addi SP, SP, -STACKSIZE
/* Load Y / INCY / BUFFER from FRAMESLOT(n) above the new frame (the
   "+ STACKSIZE" rebases the offset to the pre-adjustment SP). These are
   presumably the arguments that were passed on the stack -- TODO confirm
   against the caller's ABI. Several mutually exclusive variants are
   listed; lwz is a 32-bit load and ld a 64-bit load. Most of the
   conditionals that select between the variants are not visible here. */
252 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
254 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
255 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
259 #if defined(_AIX) || defined(__APPLE__)
262 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
263 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
264 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
266 lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
267 lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
270 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
271 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
/* Shift LDA, INCX and INCY left by BASE_SHIFT -- presumably converting
   element counts into byte strides (BASE_SHIFT is defined outside this
   view). */
277 slwi LDA, LDA, BASE_SHIFT
278 slwi INCX, INCX, BASE_SHIFT
279 slwi INCY, INCY, BASE_SHIFT
/* PREA = PREFETCHSIZE_A * SIZE. The instructions that use PREA are not
   visible in this view. */
281 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the scaled INCX with SIZE into cr0 (equality presumably means
   unit stride). The branch that consumes cr0 is not visible here. */
290 cmpwi cr0, INCX, SIZE
/* Store a1..a8 to eight consecutive element slots of BUFFER, then
   advance BUFFER by eight elements. The loads that fill a1..a8 are not
   visible here; presumably they gather a strided vector into contiguous
   storage -- TODO confirm. */
322 STFD a1, 0 * SIZE(BUFFER)
323 STFD a2, 1 * SIZE(BUFFER)
324 STFD a3, 2 * SIZE(BUFFER)
325 STFD a4, 3 * SIZE(BUFFER)
326 STFD a5, 4 * SIZE(BUFFER)
327 STFD a6, 5 * SIZE(BUFFER)
328 STFD a7, 6 * SIZE(BUFFER)
329 STFD a8, 7 * SIZE(BUFFER)
331 addi BUFFER, BUFFER, 8 * SIZE
/* Single-element variant of the same store-and-advance step. */
345 STFD a1, 0 * SIZE(BUFFER)
346 addi BUFFER, BUFFER, 1 * SIZE
/* Compare INCY with SIZE into cr0; the branch that consumes it is not
   visible here. */
354 cmpwi cr0, INCY, SIZE
/* Fill eight consecutive slots of BUFFER with the value in f0 and
   advance BUFFER by eight elements. f0 presumably holds 0.0 (cf. the
   FZERO stack slot); its load is not visible -- TODO confirm. */
365 STFD f0, 0 * SIZE(BUFFER)
366 STFD f0, 1 * SIZE(BUFFER)
367 STFD f0, 2 * SIZE(BUFFER)
368 STFD f0, 3 * SIZE(BUFFER)
369 STFD f0, 4 * SIZE(BUFFER)
370 STFD f0, 5 * SIZE(BUFFER)
371 STFD f0, 6 * SIZE(BUFFER)
372 STFD f0, 7 * SIZE(BUFFER)
373 addi BUFFER, BUFFER, 8 * SIZE
/* Setup for a pass over four matrix pointers AO1..AO4 (presumably four
   adjacent columns of A -- TODO confirm). */
/* TEMP = IS << BASE_SHIFT. The loads below use TEMP as a base address,
   so an instruction adding a base pointer to TEMP is presumably among
   the lines not visible in this view. */
390 slwi TEMP, IS, BASE_SHIFT
/* atemp1..atemp4: four consecutive elements starting at TEMP. */
396 LFD atemp1, 0 * SIZE(TEMP)
397 LFD atemp2, 1 * SIZE(TEMP)
398 LFD atemp3, 2 * SIZE(TEMP)
399 LFD atemp4, 3 * SIZE(TEMP)
/* xtemp1..xtemp4: first four elements at X. */
401 LFD xtemp1, 0 * SIZE(X)
402 LFD xtemp2, 1 * SIZE(X)
403 LFD xtemp3, 2 * SIZE(X)
404 LFD xtemp4, 3 * SIZE(X)
/* y01..y04: first four elements at NEW_Y. */
406 LFD y01, 0 * SIZE(NEW_Y)
407 LFD y02, 1 * SIZE(NEW_Y)
408 LFD y03, 2 * SIZE(NEW_Y)
409 LFD y04, 3 * SIZE(NEW_Y)
/* Load a 4x4 tile: a1..a4 from AO1, a5..a8 from AO2, a9..a12 from AO3,
   a13..a16 from AO4. Interleaved with the AO1 loads, atemp1..atemp4 are
   multiplied by the value currently in a16 (presumably alpha, loaded by
   a line not visible here -- TODO confirm). a16 is overwritten with
   matrix data only by the last load of this block, after all four
   multiplies. */
411 LFD a1, 0 * SIZE(AO1)
412 FMUL atemp1, a16, atemp1
413 LFD a2, 1 * SIZE(AO1)
414 FMUL atemp2, a16, atemp2
415 LFD a3, 2 * SIZE(AO1)
416 FMUL atemp3, a16, atemp3
417 LFD a4, 3 * SIZE(AO1)
418 FMUL atemp4, a16, atemp4
420 LFD a5, 0 * SIZE(AO2)
422 LFD a6, 1 * SIZE(AO2)
424 LFD a7, 2 * SIZE(AO2)
426 LFD a8, 3 * SIZE(AO2)
428 LFD a9, 0 * SIZE(AO3)
429 LFD a10, 1 * SIZE(AO3)
430 LFD a11, 2 * SIZE(AO3)
431 LFD a12, 3 * SIZE(AO3)
433 LFD a13, 0 * SIZE(AO4)
434 LFD a14, 1 * SIZE(AO4)
435 LFD a15, 2 * SIZE(AO4)
436 LFD a16, 3 * SIZE(AO4)
/* Sixteen-element unrolled body: four steps of four elements each.
   Presumably the body of a counted loop; its label and branch are not
   visible in this view.

   Register roles, as established by the loads in this block:
     a1..a4 / a5..a8 / a9..a12 / a13..a16
                four consecutive elements from AO1 / AO2 / AO3 / AO4
     xtemp1..4  four consecutive elements from XX
     y01..y04   four consecutive elements from YY

   Assuming FMADD d, a, b, c computes d = a * b + c (the macro is
   defined outside this view), each step performs, for R, J in 1..4:
     xsumJ += xtempR * AOJ[R]
     y0R   += atempJ * AOJ[R]
   Loads for the following step are interleaved with the arithmetic;
   each register is reloaded only after its last use in the current
   step. */
/* Step 1: elements 0..3; reloads fetch elements 4..7. */
447 FMADD xsum1, xtemp1, a1, xsum1
449 FMADD y01, atemp1, a1, y01
450 LFD a1, 4 * SIZE(AO1)
452 FMADD xsum2, xtemp1, a5, xsum2
454 FMADD y02, atemp1, a2, y02
457 FMADD xsum3, xtemp1, a9, xsum3
459 FMADD y03, atemp1, a3, y03
462 FMADD xsum4, xtemp1, a13, xsum4
463 LFD xtemp1, 4 * SIZE(XX)
464 FMADD y04, atemp1, a4, y04
467 FMADD xsum1, xtemp2, a2, xsum1
468 LFD a2, 5 * SIZE(AO1)
469 FMADD y01, atemp2, a5, y01
470 LFD a5, 4 * SIZE(AO2)
472 FMADD xsum2, xtemp2, a6, xsum2
474 FMADD y02, atemp2, a6, y02
475 LFD a6, 5 * SIZE(AO2)
477 FMADD xsum3, xtemp2, a10, xsum3
479 FMADD y03, atemp2, a7, y03
482 FMADD xsum4, xtemp2, a14, xsum4
483 LFD xtemp2, 5 * SIZE(XX)
484 FMADD y04, atemp2, a8, y04
488 FMADD xsum1, xtemp3, a3, xsum1
489 LFD a3, 6 * SIZE(AO1)
490 FMADD y01, atemp3, a9, y01
491 LFD a9, 4 * SIZE(AO3)
493 FMADD xsum2, xtemp3, a7, xsum2
494 LFD a7, 6 * SIZE(AO2)
495 FMADD y02, atemp3, a10, y02
496 LFD a10, 5 * SIZE(AO3)
498 FMADD xsum3, xtemp3, a11, xsum3
500 FMADD y03, atemp3, a11, y03
501 LFD a11, 6 * SIZE(AO3)
503 FMADD xsum4, xtemp3, a15, xsum4
504 LFD xtemp3, 6 * SIZE(XX)
505 FMADD y04, atemp3, a12, y04
508 FMADD xsum1, xtemp4, a4, xsum1
509 LFD a4, 7 * SIZE(AO1)
510 FMADD y01, atemp4, a13, y01
511 LFD a13, 4 * SIZE(AO4)
513 FMADD xsum2, xtemp4, a8, xsum2
514 LFD a8, 7 * SIZE(AO2)
515 FMADD y02, atemp4, a14, y02
516 LFD a14, 5 * SIZE(AO4)
518 FMADD xsum3, xtemp4, a12, xsum3
519 LFD a12, 7 * SIZE(AO3)
520 FMADD y03, atemp4, a15, y03
521 LFD a15, 6 * SIZE(AO4)
523 FMADD xsum4, xtemp4, a16, xsum4
524 LFD xtemp4, 7 * SIZE(XX)
525 FMADD y04, atemp4, a16, y04
526 LFD a16, 7 * SIZE(AO4)
/* Write back updated YY[0..3] and fetch YY[4..7] for step 2. */
528 STFD y01, 0 * SIZE(YY)
529 LFD y01, 4 * SIZE(YY)
530 STFD y02, 1 * SIZE(YY)
531 LFD y02, 5 * SIZE(YY)
533 STFD y03, 2 * SIZE(YY)
534 LFD y03, 6 * SIZE(YY)
535 STFD y04, 3 * SIZE(YY)
536 LFD y04, 7 * SIZE(YY)
/* Step 2: elements 4..7; reloads fetch elements 8..11. */
538 FMADD xsum1, xtemp1, a1, xsum1
540 FMADD y01, atemp1, a1, y01
541 LFD a1, 8 * SIZE(AO1)
543 FMADD xsum2, xtemp1, a5, xsum2
545 FMADD y02, atemp1, a2, y02
548 FMADD xsum3, xtemp1, a9, xsum3
550 FMADD y03, atemp1, a3, y03
553 FMADD xsum4, xtemp1, a13, xsum4
554 LFD xtemp1, 8 * SIZE(XX)
555 FMADD y04, atemp1, a4, y04
558 FMADD xsum1, xtemp2, a2, xsum1
559 LFD a2, 9 * SIZE(AO1)
560 FMADD y01, atemp2, a5, y01
561 LFD a5, 8 * SIZE(AO2)
563 FMADD xsum2, xtemp2, a6, xsum2
565 FMADD y02, atemp2, a6, y02
566 LFD a6, 9 * SIZE(AO2)
568 FMADD xsum3, xtemp2, a10, xsum3
570 FMADD y03, atemp2, a7, y03
573 FMADD xsum4, xtemp2, a14, xsum4
574 LFD xtemp2, 9 * SIZE(XX)
575 FMADD y04, atemp2, a8, y04
578 FMADD xsum1, xtemp3, a3, xsum1
579 LFD a3, 10 * SIZE(AO1)
580 FMADD y01, atemp3, a9, y01
581 LFD a9, 8 * SIZE(AO3)
583 FMADD xsum2, xtemp3, a7, xsum2
584 LFD a7, 10 * SIZE(AO2)
585 FMADD y02, atemp3, a10, y02
586 LFD a10, 9 * SIZE(AO3)
588 FMADD xsum3, xtemp3, a11, xsum3
590 FMADD y03, atemp3, a11, y03
591 LFD a11, 10 * SIZE(AO3)
593 FMADD xsum4, xtemp3, a15, xsum4
594 LFD xtemp3, 10 * SIZE(XX)
595 FMADD y04, atemp3, a12, y04
598 FMADD xsum1, xtemp4, a4, xsum1
599 LFD a4, 11 * SIZE(AO1)
600 FMADD y01, atemp4, a13, y01
601 LFD a13, 8 * SIZE(AO4)
603 FMADD xsum2, xtemp4, a8, xsum2
604 LFD a8, 11 * SIZE(AO2)
605 FMADD y02, atemp4, a14, y02
606 LFD a14, 9 * SIZE(AO4)
608 FMADD xsum3, xtemp4, a12, xsum3
609 LFD a12, 11 * SIZE(AO3)
610 FMADD y03, atemp4, a15, y03
611 LFD a15, 10 * SIZE(AO4)
613 FMADD xsum4, xtemp4, a16, xsum4
614 LFD xtemp4, 11 * SIZE(XX)
615 FMADD y04, atemp4, a16, y04
616 LFD a16, 11 * SIZE(AO4)
/* Write back updated YY[4..7] and fetch YY[8..11] for step 3. */
618 STFD y01, 4 * SIZE(YY)
619 LFD y01, 8 * SIZE(YY)
620 STFD y02, 5 * SIZE(YY)
621 LFD y02, 9 * SIZE(YY)
623 STFD y03, 6 * SIZE(YY)
624 LFD y03, 10 * SIZE(YY)
625 STFD y04, 7 * SIZE(YY)
626 LFD y04, 11 * SIZE(YY)
/* Step 3: elements 8..11; reloads fetch elements 12..15. */
629 FMADD xsum1, xtemp1, a1, xsum1
631 FMADD y01, atemp1, a1, y01
632 LFD a1, 12 * SIZE(AO1)
634 FMADD xsum2, xtemp1, a5, xsum2
636 FMADD y02, atemp1, a2, y02
639 FMADD xsum3, xtemp1, a9, xsum3
641 FMADD y03, atemp1, a3, y03
644 FMADD xsum4, xtemp1, a13, xsum4
645 LFD xtemp1, 12 * SIZE(XX)
646 FMADD y04, atemp1, a4, y04
649 FMADD xsum1, xtemp2, a2, xsum1
650 LFD a2, 13 * SIZE(AO1)
651 FMADD y01, atemp2, a5, y01
652 LFD a5, 12 * SIZE(AO2)
654 FMADD xsum2, xtemp2, a6, xsum2
656 FMADD y02, atemp2, a6, y02
657 LFD a6, 13 * SIZE(AO2)
659 FMADD xsum3, xtemp2, a10, xsum3
661 FMADD y03, atemp2, a7, y03
665 FMADD xsum4, xtemp2, a14, xsum4
666 LFD xtemp2, 13 * SIZE(XX)
667 FMADD y04, atemp2, a8, y04
670 FMADD xsum1, xtemp3, a3, xsum1
671 LFD a3, 14 * SIZE(AO1)
672 FMADD y01, atemp3, a9, y01
673 LFD a9, 12 * SIZE(AO3)
675 FMADD xsum2, xtemp3, a7, xsum2
676 LFD a7, 14 * SIZE(AO2)
677 FMADD y02, atemp3, a10, y02
678 LFD a10,13 * SIZE(AO3)
680 FMADD xsum3, xtemp3, a11, xsum3
682 FMADD y03, atemp3, a11, y03
683 LFD a11, 14 * SIZE(AO3)
685 FMADD xsum4, xtemp3, a15, xsum4
686 LFD xtemp3, 14 * SIZE(XX)
687 FMADD y04, atemp3, a12, y04
690 FMADD xsum1, xtemp4, a4, xsum1
691 LFD a4, 15 * SIZE(AO1)
692 FMADD y01, atemp4, a13, y01
693 LFD a13,12 * SIZE(AO4)
695 FMADD xsum2, xtemp4, a8, xsum2
696 LFD a8, 15 * SIZE(AO2)
697 FMADD y02, atemp4, a14, y02
698 LFD a14, 13 * SIZE(AO4)
700 FMADD xsum3, xtemp4, a12, xsum3
701 LFD a12, 15 * SIZE(AO3)
702 FMADD y03, atemp4, a15, y03
703 LFD a15, 14 * SIZE(AO4)
705 FMADD xsum4, xtemp4, a16, xsum4
706 LFD xtemp4, 15 * SIZE(XX)
707 FMADD y04, atemp4, a16, y04
708 LFD a16, 15 * SIZE(AO4)
/* Write back updated YY[8..11] and fetch YY[12..15] for step 4. */
710 STFD y01, 8 * SIZE(YY)
711 LFD y01, 12 * SIZE(YY)
712 STFD y02, 9 * SIZE(YY)
713 LFD y02, 13 * SIZE(YY)
715 STFD y03, 10 * SIZE(YY)
716 LFD y03, 14 * SIZE(YY)
717 STFD y04, 11 * SIZE(YY)
718 LFD y04, 15 * SIZE(YY)
/* Step 4: elements 12..15; reloads fetch elements 16..19, i.e. the
   first four elements of the next sixteen. YY, AO3, AO1, AO2, AO4 and
   XX are each advanced by 16 * SIZE in the middle of this step, so
   displacements depend on whether the access comes before or after the
   matching addi: e.g. 17 * SIZE(AO1) before the AO1 advance and
   2 * SIZE(AO1) after it address elements 17 and 18 of the same run. */
720 FMADD xsum1, xtemp1, a1, xsum1
722 FMADD y01, atemp1, a1, y01
723 LFD a1, 16 * SIZE(AO1)
725 FMADD xsum2, xtemp1, a5, xsum2
727 FMADD y02, atemp1, a2, y02
730 FMADD xsum3, xtemp1, a9, xsum3
732 FMADD y03, atemp1, a3, y03
735 FMADD xsum4, xtemp1, a13, xsum4
736 LFD xtemp1, 16 * SIZE(XX)
737 FMADD y04, atemp1, a4, y04
738 addi YY, YY, 16 * SIZE
740 FMADD xsum1, xtemp2, a2, xsum1
741 LFD a2, 17 * SIZE(AO1)
742 FMADD y01, atemp2, a5, y01
743 LFD a5, 16 * SIZE(AO2)
745 FMADD xsum2, xtemp2, a6, xsum2
746 addi AO3, AO3, 16 * SIZE
747 FMADD y02, atemp2, a6, y02
748 LFD a6, 17 * SIZE(AO2)
750 FMADD xsum3, xtemp2, a10, xsum3
751 addi AO1, AO1, 16 * SIZE
752 FMADD y03, atemp2, a7, y03
753 addi AO2, AO2, 16 * SIZE
755 FMADD xsum4, xtemp2, a14, xsum4
756 LFD xtemp2, 17 * SIZE(XX)
757 FMADD y04, atemp2, a8, y04
758 addi AO4, AO4, 16 * SIZE
760 FMADD xsum1, xtemp3, a3, xsum1
761 LFD a3, 2 * SIZE(AO1)
762 FMADD y01, atemp3, a9, y01
763 LFD a9, 0 * SIZE(AO3)
765 FMADD xsum2, xtemp3, a7, xsum2
766 LFD a7, 2 * SIZE(AO2)
767 FMADD y02, atemp3, a10, y02
768 LFD a10, 1 * SIZE(AO3)
770 FMADD xsum3, xtemp3, a11, xsum3
772 FMADD y03, atemp3, a11, y03
773 LFD a11, 2 * SIZE(AO3)
775 FMADD xsum4, xtemp3, a15, xsum4
776 LFD xtemp3, 18 * SIZE(XX)
777 FMADD y04, atemp3, a12, y04
778 addi XX, XX, 16 * SIZE
780 FMADD xsum1, xtemp4, a4, xsum1
781 LFD a4, 3 * SIZE(AO1)
782 FMADD y01, atemp4, a13, y01
783 LFD a13, 0 * SIZE(AO4)
785 FMADD xsum2, xtemp4, a8, xsum2
786 LFD a8, 3 * SIZE(AO2)
787 FMADD y02, atemp4, a14, y02
788 LFD a14, 1 * SIZE(AO4)
790 FMADD xsum3, xtemp4, a12, xsum3
791 LFD a12, 3 * SIZE(AO3)
792 FMADD y03, atemp4, a15, y03
793 LFD a15, 2 * SIZE(AO4)
795 FMADD xsum4, xtemp4, a16, xsum4
796 LFD xtemp4, 3 * SIZE(XX)
797 FMADD y04, atemp4, a16, y04
798 LFD a16, 3 * SIZE(AO4)
/* YY has already been advanced by 16 elements, so the results for
   elements 12..15 are stored at negative displacements -4..-1 and the
   next four y values are fetched from displacements 0..3. */
800 STFD y01, -4 * SIZE(YY)
801 LFD y01, 0 * SIZE(YY)
802 STFD y02, -3 * SIZE(YY)
803 LFD y02, 1 * SIZE(YY)
805 STFD y03, -2 * SIZE(YY)
806 LFD y03, 2 * SIZE(YY)
807 STFD y04, -1 * SIZE(YY)
808 LFD y04, 3 * SIZE(YY)
/* Eight-element body: two steps of four elements, using the same
   register roles and update pattern as the sixteen-element body:
     xsumJ += xtempR * AOJ[R],  y0R += atempJ * AOJ[R]   (R, J in 1..4)
   assuming FMADD d, a, b, c computes d = a * b + c (macro defined
   outside this view). Presumably executed when at least eight elements
   remain; the test and branch that select it are not visible here. */
/* Step 1: elements 0..3; reloads fetch elements 4..7. */
816 FMADD xsum1, xtemp1, a1, xsum1
818 FMADD y01, atemp1, a1, y01
819 LFD a1, 4 * SIZE(AO1)
821 FMADD xsum2, xtemp1, a5, xsum2
823 FMADD y02, atemp1, a2, y02
826 FMADD xsum3, xtemp1, a9, xsum3
828 FMADD y03, atemp1, a3, y03
831 FMADD xsum4, xtemp1, a13, xsum4
832 LFD xtemp1, 4 * SIZE(XX)
833 FMADD y04, atemp1, a4, y04
836 FMADD xsum1, xtemp2, a2, xsum1
837 LFD a2, 5 * SIZE(AO1)
838 FMADD y01, atemp2, a5, y01
839 LFD a5, 4 * SIZE(AO2)
841 FMADD xsum2, xtemp2, a6, xsum2
843 FMADD y02, atemp2, a6, y02
844 LFD a6, 5 * SIZE(AO2)
846 FMADD xsum3, xtemp2, a10, xsum3
848 FMADD y03, atemp2, a7, y03
851 FMADD xsum4, xtemp2, a14, xsum4
852 LFD xtemp2, 5 * SIZE(XX)
853 FMADD y04, atemp2, a8, y04
856 FMADD xsum1, xtemp3, a3, xsum1
857 LFD a3, 6 * SIZE(AO1)
858 FMADD y01, atemp3, a9, y01
859 LFD a9, 4 * SIZE(AO3)
861 FMADD xsum2, xtemp3, a7, xsum2
862 LFD a7, 6 * SIZE(AO2)
863 FMADD y02, atemp3, a10, y02
864 LFD a10, 5 * SIZE(AO3)
866 FMADD xsum3, xtemp3, a11, xsum3
868 FMADD y03, atemp3, a11, y03
869 LFD a11, 6 * SIZE(AO3)
871 FMADD xsum4, xtemp3, a15, xsum4
872 LFD xtemp3, 6 * SIZE(XX)
873 FMADD y04, atemp3, a12, y04
876 FMADD xsum1, xtemp4, a4, xsum1
877 LFD a4, 7 * SIZE(AO1)
878 FMADD y01, atemp4, a13, y01
879 LFD a13, 4 * SIZE(AO4)
881 FMADD xsum2, xtemp4, a8, xsum2
882 LFD a8, 7 * SIZE(AO2)
883 FMADD y02, atemp4, a14, y02
884 LFD a14, 5 * SIZE(AO4)
886 FMADD xsum3, xtemp4, a12, xsum3
887 LFD a12, 7 * SIZE(AO3)
888 FMADD y03, atemp4, a15, y03
889 LFD a15, 6 * SIZE(AO4)
891 FMADD xsum4, xtemp4, a16, xsum4
892 LFD xtemp4, 7 * SIZE(XX)
893 FMADD y04, atemp4, a16, y04
894 LFD a16, 7 * SIZE(AO4)
/* Write back updated YY[0..3] and fetch YY[4..7] for step 2. */
896 STFD y01, 0 * SIZE(YY)
897 LFD y01, 4 * SIZE(YY)
898 STFD y02, 1 * SIZE(YY)
899 LFD y02, 5 * SIZE(YY)
901 STFD y03, 2 * SIZE(YY)
902 LFD y03, 6 * SIZE(YY)
903 STFD y04, 3 * SIZE(YY)
904 LFD y04, 7 * SIZE(YY)
/* Step 2: elements 4..7; reloads fetch elements 8..11, which become
   elements 0..3 once the pointers are advanced below. */
906 FMADD xsum1, xtemp1, a1, xsum1
908 FMADD y01, atemp1, a1, y01
909 LFD a1, 8 * SIZE(AO1)
911 FMADD xsum2, xtemp1, a5, xsum2
913 FMADD y02, atemp1, a2, y02
916 FMADD xsum3, xtemp1, a9, xsum3
918 FMADD y03, atemp1, a3, y03
921 FMADD xsum4, xtemp1, a13, xsum4
922 LFD xtemp1, 8 * SIZE(XX)
923 FMADD y04, atemp1, a4, y04
926 FMADD xsum1, xtemp2, a2, xsum1
927 LFD a2, 9 * SIZE(AO1)
928 FMADD y01, atemp2, a5, y01
929 LFD a5, 8 * SIZE(AO2)
931 FMADD xsum2, xtemp2, a6, xsum2
933 FMADD y02, atemp2, a6, y02
934 LFD a6, 9 * SIZE(AO2)
936 FMADD xsum3, xtemp2, a10, xsum3
938 FMADD y03, atemp2, a7, y03
941 FMADD xsum4, xtemp2, a14, xsum4
942 LFD xtemp2, 9 * SIZE(XX)
943 FMADD y04, atemp2, a8, y04
946 FMADD xsum1, xtemp3, a3, xsum1
947 LFD a3, 10 * SIZE(AO1)
948 FMADD y01, atemp3, a9, y01
949 LFD a9, 8 * SIZE(AO3)
951 FMADD xsum2, xtemp3, a7, xsum2
952 LFD a7, 10 * SIZE(AO2)
953 FMADD y02, atemp3, a10, y02
954 LFD a10, 9 * SIZE(AO3)
956 FMADD xsum3, xtemp3, a11, xsum3
958 FMADD y03, atemp3, a11, y03
959 LFD a11, 10 * SIZE(AO3)
961 FMADD xsum4, xtemp3, a15, xsum4
962 LFD xtemp3, 10 * SIZE(XX)
963 FMADD y04, atemp3, a12, y04
966 FMADD xsum1, xtemp4, a4, xsum1
967 LFD a4, 11 * SIZE(AO1)
968 FMADD y01, atemp4, a13, y01
969 LFD a13, 8 * SIZE(AO4)
971 FMADD xsum2, xtemp4, a8, xsum2
972 LFD a8, 11 * SIZE(AO2)
973 FMADD y02, atemp4, a14, y02
974 LFD a14, 9 * SIZE(AO4)
976 FMADD xsum3, xtemp4, a12, xsum3
977 LFD a12, 11 * SIZE(AO3)
978 FMADD y03, atemp4, a15, y03
979 LFD a15, 10 * SIZE(AO4)
981 FMADD xsum4, xtemp4, a16, xsum4
982 LFD xtemp4, 11 * SIZE(XX)
983 FMADD y04, atemp4, a16, y04
984 LFD a16, 11 * SIZE(AO4)
/* Advance the four matrix pointers past the eight elements consumed. */
986 addi AO1, AO1, 8 * SIZE
987 addi AO2, AO2, 8 * SIZE
988 addi AO3, AO3, 8 * SIZE
989 addi AO4, AO4, 8 * SIZE
/* Write back updated YY[4..7], fetch YY[8..11], then advance XX and YY
   by eight elements so the fetched values sit at displacements 0..3. */
991 STFD y01, 4 * SIZE(YY)
992 LFD y01, 8 * SIZE(YY)
993 STFD y02, 5 * SIZE(YY)
994 LFD y02, 9 * SIZE(YY)
996 STFD y03, 6 * SIZE(YY)
997 LFD y03, 10 * SIZE(YY)
998 STFD y04, 7 * SIZE(YY)
999 LFD y04, 11 * SIZE(YY)
1001 addi XX, XX, 8 * SIZE
1002 addi YY, YY, 8 * SIZE
/* Four-element body: a single step with the same register roles and
   update pattern as the larger unrolled bodies:
     xsumJ += xtempR * AOJ[R],  y0R += atempJ * AOJ[R]   (R, J in 1..4)
   assuming FMADD d, a, b, c computes d = a * b + c (macro defined
   outside this view). Presumably executed when at least four elements
   remain; the test and branch that select it are not visible here.
   Reloads fetch elements 4..7, which become elements 0..3 once the
   pointers are advanced at the end of the block. */
1009 FMADD xsum1, xtemp1, a1, xsum1
1011 FMADD y01, atemp1, a1, y01
1012 LFD a1, 4 * SIZE(AO1)
1014 FMADD xsum2, xtemp1, a5, xsum2
1016 FMADD y02, atemp1, a2, y02
1019 FMADD xsum3, xtemp1, a9, xsum3
1021 FMADD y03, atemp1, a3, y03
1024 FMADD xsum4, xtemp1, a13, xsum4
1025 LFD xtemp1, 4 * SIZE(XX)
1026 FMADD y04, atemp1, a4, y04
1029 FMADD xsum1, xtemp2, a2, xsum1
1030 LFD a2, 5 * SIZE(AO1)
1031 FMADD y01, atemp2, a5, y01
1032 LFD a5, 4 * SIZE(AO2)
1034 FMADD xsum2, xtemp2, a6, xsum2
1036 FMADD y02, atemp2, a6, y02
1037 LFD a6, 5 * SIZE(AO2)
1039 FMADD xsum3, xtemp2, a10, xsum3
1041 FMADD y03, atemp2, a7, y03
1044 FMADD xsum4, xtemp2, a14, xsum4
1045 LFD xtemp2, 5 * SIZE(XX)
1046 FMADD y04, atemp2, a8, y04
1049 FMADD xsum1, xtemp3, a3, xsum1
1050 LFD a3, 6 * SIZE(AO1)
1051 FMADD y01, atemp3, a9, y01
1052 LFD a9, 4 * SIZE(AO3)
1054 FMADD xsum2, xtemp3, a7, xsum2
1055 LFD a7, 6 * SIZE(AO2)
1056 FMADD y02, atemp3, a10, y02
1057 LFD a10, 5 * SIZE(AO3)
1059 FMADD xsum3, xtemp3, a11, xsum3
1061 FMADD y03, atemp3, a11, y03
1062 LFD a11, 6 * SIZE(AO3)
1064 FMADD xsum4, xtemp3, a15, xsum4
1065 LFD xtemp3, 6 * SIZE(XX)
1066 FMADD y04, atemp3, a12, y04
1069 FMADD xsum1, xtemp4, a4, xsum1
1070 LFD a4, 7 * SIZE(AO1)
1071 FMADD y01, atemp4, a13, y01
1072 LFD a13, 4 * SIZE(AO4)
1074 FMADD xsum2, xtemp4, a8, xsum2
1075 LFD a8, 7 * SIZE(AO2)
1076 FMADD y02, atemp4, a14, y02
1077 LFD a14, 5 * SIZE(AO4)
1079 FMADD xsum3, xtemp4, a12, xsum3
1080 LFD a12, 7 * SIZE(AO3)
1081 FMADD y03, atemp4, a15, y03
1082 LFD a15, 6 * SIZE(AO4)
1084 FMADD xsum4, xtemp4, a16, xsum4
1085 LFD xtemp4, 7 * SIZE(XX)
1086 FMADD y04, atemp4, a16, y04
1087 LFD a16, 7 * SIZE(AO4)
/* Advance the four matrix pointers past the four elements consumed. */
1089 addi AO1, AO1, 4 * SIZE
1090 addi AO2, AO2, 4 * SIZE
1091 addi AO3, AO3, 4 * SIZE
1092 addi AO4, AO4, 4 * SIZE
/* Write back updated YY[0..3], fetch YY[4..7], then advance XX and YY
   by four elements so the fetched values sit at displacements 0..3. */
1094 STFD y01, 0 * SIZE(YY)
1095 LFD y01, 4 * SIZE(YY)
1096 STFD y02, 1 * SIZE(YY)
1097 LFD y02, 5 * SIZE(YY)
1099 STFD y03, 2 * SIZE(YY)
1100 LFD y03, 6 * SIZE(YY)
1101 STFD y04, 3 * SIZE(YY)
1102 LFD y04, 7 * SIZE(YY)
1104 addi XX, XX, 4 * SIZE
1105 addi YY, YY, 4 * SIZE
/* Closing step for the four-pointer pass: fold the xsum accumulators
   and the remaining 4x4 tile into y01..y04 and store them. */
/* Scale each accumulator by the value in xtemp1. At this point xtemp1
   presumably holds alpha, reloaded by a line not visible in this view
   -- TODO confirm. */
1111 FMUL xsum1, xtemp1, xsum1
1112 FMUL xsum2, xtemp1, xsum2
1113 FMUL xsum3, xtemp1, xsum3
1114 FMUL xsum4, xtemp1, xsum4
/* Add the 4x4 tile times atemp1..atemp4 (assuming FMADD d, a, b, c is
   d = a * b + c). The tile elements are referenced in a symmetric
   pattern:
       xsum1: a1  a5  a9  a13
       xsum2: a5  a6  a10 a14
       xsum3: a9  a10 a11 a15
       xsum4: a13 a14 a15 a16
   so a2, a3, a4, a7, a8 and a12 are never read here. Presumably only
   one triangle of the diagonal block is treated as valid storage and
   mirrored -- TODO confirm against the routine's UPLO variant. */
1116 FMADD xsum1, atemp1, a1, xsum1
1117 FMADD xsum2, atemp1, a5, xsum2
1118 FMADD xsum3, atemp1, a9, xsum3
1119 FMADD xsum4, atemp1, a13, xsum4
1121 FMADD xsum1, atemp2, a5, xsum1
1122 FMADD xsum2, atemp2, a6, xsum2
1123 FMADD xsum3, atemp2, a10, xsum3
1124 FMADD xsum4, atemp2, a14, xsum4
1126 FMADD xsum1, atemp3, a9, xsum1
1127 FMADD xsum2, atemp3, a10, xsum2
1128 FMADD xsum3, atemp3, a11, xsum3
1129 FMADD xsum4, atemp3, a15, xsum4
1131 FMADD xsum1, atemp4, a13, xsum1
1132 FMADD xsum2, atemp4, a14, xsum2
1133 FMADD xsum3, atemp4, a15, xsum3
1134 FMADD xsum4, atemp4, a16, xsum4
/* y0J += xsumJ, then store the four results at YY[0..3]. */
1136 FADD y01, y01, xsum1
1137 FADD y02, y02, xsum2
1138 FADD y03, y03, xsum3
1139 FADD y04, y04, xsum4
1141 STFD y01, 0 * SIZE(YY)
1142 STFD y02, 1 * SIZE(YY)
1143 STFD y03, 2 * SIZE(YY)
1144 STFD y04, 3 * SIZE(YY)
/* Two-pointer variant (AO1, AO2): the same scheme as the four-pointer
   pass, reduced to a 2x2 tile. Presumably handles a remainder of two
   columns; the test and branches selecting it are not visible here. */
/* TEMP = IS << BASE_SHIFT, then used as a base address below; the
   instruction adding a base pointer to TEMP is presumably among the
   lines not visible in this view. */
1160 slwi TEMP, IS, BASE_SHIFT
1163 LFD atemp1, 0 * SIZE(TEMP)
1164 LFD atemp2, 1 * SIZE(TEMP)
/* Scale atemp1/atemp2 by the value in a1 (presumably alpha, loaded by a
   line not visible -- TODO confirm). a1 is reloaded with matrix data
   further down. */
1168 FMUL atemp1, a1, atemp1
1169 FMUL atemp2, a1, atemp2
/* Preload two elements each from XX and YY, and a 2x2 tile: a1, a2 from
   AO1 and a5, a6 from AO2. */
1177 LFD xtemp1, 0 * SIZE(XX)
1178 LFD xtemp2, 1 * SIZE(XX)
1180 LFD y01, 0 * SIZE(YY)
1181 LFD y02, 1 * SIZE(YY)
1183 LFD a1, 0 * SIZE(AO1)
1184 LFD a2, 1 * SIZE(AO1)
1186 LFD a5, 0 * SIZE(AO2)
1187 LFD a6, 1 * SIZE(AO2)
/* Two-element step (presumably a loop body; label and branch are not
   visible). Assuming FMADD d, a, b, c is d = a * b + c:
     xsum1 += xtemp1 * a1 + xtemp2 * a2      (AO1 elements)
     xsum2 += xtemp1 * a5 + xtemp2 * a6      (AO2 elements)
     y01   += atemp1 * a1 + atemp2 * a5
     y02   += atemp1 * a2 + atemp2 * a6 */
1195 FMADD xsum1, xtemp1, a1, xsum1
1196 FMADD xsum2, xtemp1, a5, xsum2
1198 FMADD xsum1, xtemp2, a2, xsum1
1199 FMADD xsum2, xtemp2, a6, xsum2
1201 FMADD y01, atemp1, a1, y01
1202 FMADD y02, atemp1, a2, y02
1203 FMADD y01, atemp2, a5, y01
1204 FMADD y02, atemp2, a6, y02
/* Fetch the next two elements of XX, AO1 and AO2. */
1206 LFD xtemp1, 2 * SIZE(XX)
1207 LFD xtemp2, 3 * SIZE(XX)
1209 LFD a1, 2 * SIZE(AO1)
1210 LFD a2, 3 * SIZE(AO1)
1212 LFD a5, 2 * SIZE(AO2)
1213 LFD a6, 3 * SIZE(AO2)
/* Write back YY[0..1], fetch YY[2..3], then advance every pointer by
   two elements. */
1215 STFD y01, 0 * SIZE(YY)
1216 STFD y02, 1 * SIZE(YY)
1218 LFD y01, 2 * SIZE(YY)
1219 LFD y02, 3 * SIZE(YY)
1221 addi AO1, AO1, 2 * SIZE
1222 addi AO2, AO2, 2 * SIZE
1224 addi XX, XX, 2 * SIZE
1225 addi YY, YY, 2 * SIZE
/* Closing step: scale the accumulators by the value in xtemp1
   (presumably alpha, reloaded by a line not visible -- TODO confirm),
   then add the 2x2 tile using the symmetric pattern (a1 a5 / a5 a6);
   a2 is not read here. */
1233 FMUL xsum1, xtemp1, xsum1
1234 FMUL xsum2, xtemp1, xsum2
1236 FMADD xsum1, atemp1, a1, xsum1
1237 FMADD xsum2, atemp1, a5, xsum2
1238 FMADD xsum1, atemp2, a5, xsum1
1239 FMADD xsum2, atemp2, a6, xsum2
/* y0J += xsumJ, then store the two results at YY[0..1]. */
1241 FADD y01, y01, xsum1
1242 FADD y02, y02, xsum2
1244 STFD y01, 0 * SIZE(YY)
1245 STFD y02, 1 * SIZE(YY)
/* One-pointer variant (AO1 only): the same scheme reduced to a single
   element per step. Presumably handles a final remaining column; the
   test and branches selecting it are not visible here. */
/* TEMP = IS << BASE_SHIFT, then used as a base address; the instruction
   adding a base pointer to TEMP is presumably among the lines not
   visible in this view. */
1256 slwi TEMP, IS, BASE_SHIFT
1259 LFD atemp1, 0 * SIZE(TEMP)
/* Scale atemp1 by the value in a1 (presumably alpha, loaded by a line
   not visible -- TODO confirm); a1 is reloaded with matrix data below. */
1263 FMUL atemp1, a1, atemp1
/* Preload one element each from XX, YY and AO1. */
1270 LFD xtemp1, 0 * SIZE(XX)
1271 LFD y01, 0 * SIZE(YY)
1273 LFD a1, 0 * SIZE(AO1)
/* One-element step (presumably a loop body; label and branch are not
   visible). Assuming FMADD d, a, b, c is d = a * b + c:
     xsum1 += xtemp1 * a1,  y01 += atemp1 * a1
   followed by a fetch of the next XX/AO1 element, write-back of YY[0],
   a fetch of YY[1], and a one-element advance of all three pointers. */
1281 FMADD xsum1, xtemp1, a1, xsum1
1283 FMADD y01, atemp1, a1, y01
1285 LFD xtemp1, 1 * SIZE(XX)
1287 LFD a1, 1 * SIZE(AO1)
1289 STFD y01, 0 * SIZE(YY)
1291 LFD y01, 1 * SIZE(YY)
1293 addi AO1, AO1, 1 * SIZE
1295 addi XX, XX, 1 * SIZE
1296 addi YY, YY, 1 * SIZE
/* Closing step: scale xsum1 by the value in xtemp1 (presumably alpha,
   reloaded by a line not visible -- TODO confirm), add atemp1 * a1,
   fold the sum into y01 and store it at YY[0]. */
1304 FMUL xsum1, xtemp1, xsum1
1306 FMADD xsum1, atemp1, a1, xsum1
1308 FADD y01, y01, xsum1
1310 STFD y01, 0 * SIZE(YY)
/* Copy-out from NEW_Y to YY in chunks of 8, 4, 2 and 1 elements.
   Compare INCY with SIZE into cr0 first; the branch that consumes it is
   not visible. Presumably this section is needed only when Y is not
   unit-stride -- TODO confirm. */
1314 cmpwi cr0, INCY, SIZE
/* Chunk of eight: load eight consecutive elements from NEW_Y into
   f8..f15 and advance NEW_Y by eight elements. */
1342 LFD f8, 0 * SIZE(NEW_Y)
1343 LFD f9, 1 * SIZE(NEW_Y)
1344 LFD f10, 2 * SIZE(NEW_Y)
1345 LFD f11, 3 * SIZE(NEW_Y)
1346 LFD f12, 4 * SIZE(NEW_Y)
1347 LFD f13, 5 * SIZE(NEW_Y)
1348 LFD f14, 6 * SIZE(NEW_Y)
1349 LFD f15, 7 * SIZE(NEW_Y)
1350 addi NEW_Y, NEW_Y, 8 * SIZE
/* Store f8..f15. Every store uses displacement 0 of YY, so YY must be
   advanced between stores by instructions not visible in this view
   (presumably by INCY). Any arithmetic applied to f8..f15 between the
   loads and the stores is likewise not visible. */
1361 STFD f8, 0 * SIZE(YY)
1363 STFD f9, 0 * SIZE(YY)
1365 STFD f10, 0 * SIZE(YY)
1367 STFD f11, 0 * SIZE(YY)
1369 STFD f12, 0 * SIZE(YY)
1371 STFD f13, 0 * SIZE(YY)
1373 STFD f14, 0 * SIZE(YY)
1375 STFD f15, 0 * SIZE(YY)
/* Chunk of four: same pattern with f8..f11. */
1393 LFD f8, 0 * SIZE(NEW_Y)
1394 LFD f9, 1 * SIZE(NEW_Y)
1395 LFD f10, 2 * SIZE(NEW_Y)
1396 LFD f11, 3 * SIZE(NEW_Y)
1397 addi NEW_Y, NEW_Y, 4 * SIZE
1404 STFD f8, 0 * SIZE(YY)
1406 STFD f9, 0 * SIZE(YY)
1408 STFD f10, 0 * SIZE(YY)
1410 STFD f11, 0 * SIZE(YY)
/* Chunk of two: same pattern with f8, f9. */
1423 LFD f8, 0 * SIZE(NEW_Y)
1424 LFD f9, 1 * SIZE(NEW_Y)
1425 addi NEW_Y, NEW_Y, 2 * SIZE
1430 STFD f8, 0 * SIZE(YY)
1432 STFD f9, 0 * SIZE(YY)
/* Final single element. */
1441 LFD f8, 0 * SIZE(NEW_Y)
1445 STFD f8, 0 * SIZE(YY)
/* Release the stack frame: SP += STACKSIZE, undoing the adjustment made
   on entry. */
1502 addi SP, SP, STACKSIZE