1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Toolchain/ABI selection: taken when building for AIX or Apple targets. */
/* The definitions guarded by these conditionals are outside this excerpt. */
66 #if defined(_AIX) || defined(__APPLE__)
/* Nested case: not a 64-bit build and DOUBLE is defined. */
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A: prefetch distance, counted in elements; it is multiplied */
/* by SIZE when PREA is initialised.  Each definition below belongs to a    */
/* different target-selection conditional; only the PPC440 / PPC440FP2 test */
/* is visible here, the other selecting #if/#ifdef lines are not shown.     */
144 #define PREFETCHSIZE_A 24
147 #if defined(PPC440) || defined(PPC440FP2)
148 #define PREFETCHSIZE_A 24
152 #define PREFETCHSIZE_A 64
156 #define PREFETCHSIZE_A 72
160 #define PREFETCHSIZE_A 16
164 #define PREFETCHSIZE_A 96
168 #define PREFETCHSIZE_A 40
/* On POWER4 / POWER5 / POWER6 / PPC970, NOP1 and NOP2 are defined as       */
/* register-to-itself moves, i.e. instructions that leave LDA and INCX      */
/* unchanged.  Presumably used as scheduling filler; the use sites and any  */
/* alternative definition are not visible in this excerpt.                  */
171 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
175 #define NOP1 mr LDA, LDA
176 #define NOP2 mr INCX, INCX
/* Stack frame layout.  STACKSIZE is the number of bytes subtracted from SP */
/* on entry and added back on exit.  ALPHA and FZERO are SP-relative slots  */
/* inside that frame, 8 bytes apart.  Two alternative layouts follow; the   */
/* conditional that selects between them is not visible here (presumably    */
/* 32-bit versus 64-bit builds -- TODO confirm).                            */
182 #define STACKSIZE 224
183 #define ALPHA 200(SP)
184 #define FZERO 208(SP)
/* Alternative, larger frame. */
186 #define STACKSIZE 280
187 #define ALPHA 256(SP)
188 #define FZERO 264(SP)
/* Entry: allocate this routine's frame by lowering SP by STACKSIZE bytes. */
194 addi SP, SP, -STACKSIZE
/* Load the arguments that did not fit in registers.  They are addressed    */
/* as FRAMESLOT(n) + STACKSIZE(SP) because SP has already been lowered by   */
/* STACKSIZE.  Which of the variants below is assembled depends on          */
/* preprocessor conditionals that are mostly outside this excerpt; lwz      */
/* loads a 32-bit word and ld a 64-bit doubleword.                          */
253 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
255 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
256 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
260 #if defined(_AIX) || defined(__APPLE__)
/* Variant that takes Y, INCY and BUFFER from the stack. */
263 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
264 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
265 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
/* Variants that take only INCY and BUFFER from the stack. */
267 lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
268 lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
271 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
272 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
/* Shift the leading dimension and both strides left by BASE_SHIFT          */
/* (presumably log2 of the element size, turning element counts into byte   */
/* offsets -- TODO confirm against the header that defines BASE_SHIFT).     */
278 slwi LDA, LDA, BASE_SHIFT
279 slwi INCX, INCX, BASE_SHIFT
280 slwi INCY, INCY, BASE_SHIFT
/* PREA = prefetch distance: PREFETCHSIZE_A elements times SIZE. */
282 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the scaled X stride with SIZE, i.e. test for a contiguous X.     */
/* The branch that consumes cr0 is not visible; presumably the copy below   */
/* is skipped when X is already contiguous -- TODO confirm.                 */
287 cmpwi cr0, INCX, SIZE
/* Store eight values a1..a8 to consecutive BUFFER slots, then advance      */
/* BUFFER by eight elements.  The loads that fill a1..a8 are not shown.     */
319 STFD a1, 0 * SIZE(BUFFER)
320 STFD a2, 1 * SIZE(BUFFER)
321 STFD a3, 2 * SIZE(BUFFER)
322 STFD a4, 3 * SIZE(BUFFER)
323 STFD a5, 4 * SIZE(BUFFER)
324 STFD a6, 5 * SIZE(BUFFER)
325 STFD a7, 6 * SIZE(BUFFER)
326 STFD a8, 7 * SIZE(BUFFER)
328 addi BUFFER, BUFFER, 8 * SIZE
/* One-element-at-a-time variant of the same copy (presumably the           */
/* remainder when the count is not a multiple of eight).                    */
342 STFD a1, 0 * SIZE(BUFFER)
343 addi BUFFER, BUFFER, 1 * SIZE
/* Compare the scaled Y stride with SIZE, i.e. test for a contiguous Y.     */
/* The branch that consumes cr0 is not visible in this excerpt.             */
351 cmpwi cr0, INCY, SIZE
/* Write f0 to eight consecutive BUFFER slots and advance BUFFER by eight   */
/* elements.  NOTE(review): f0 presumably holds 0.0 (loaded from FZERO on a */
/* hidden line), making this the zero-fill of a temporary Y buffer --       */
/* confirm against the full source.                                         */
362 STFD f0, 0 * SIZE(BUFFER)
363 STFD f0, 1 * SIZE(BUFFER)
364 STFD f0, 2 * SIZE(BUFFER)
365 STFD f0, 3 * SIZE(BUFFER)
366 STFD f0, 4 * SIZE(BUFFER)
367 STFD f0, 5 * SIZE(BUFFER)
368 STFD f0, 6 * SIZE(BUFFER)
369 STFD f0, 7 * SIZE(BUFFER)
370 addi BUFFER, BUFFER, 8 * SIZE
/* Start of a four-column panel.  TEMP = IS shifted left by BASE_SHIFT      */
/* (presumably the byte offset of element IS; the instructions that use     */
/* TEMP to set XX / YY / AO1..AO4 are not visible).                         */
389 slwi TEMP, IS, BASE_SHIFT
/* atemp1..4 = four consecutive X values for this panel. */
393 LFD atemp1, 0 * SIZE(XX)
394 LFD atemp2, 1 * SIZE(XX)
395 LFD atemp3, 2 * SIZE(XX)
396 LFD atemp4, 3 * SIZE(XX)
/* Load the 4x4 diagonal block.  Only entries on or below the diagonal are  */
/* read: rows 0..3 of AO1, rows 1..3 of AO2, rows 2..3 of AO3 and row 3 of  */
/* AO4.                                                                     */
398 LFD a1, 0 * SIZE(AO1)
399 LFD a2, 1 * SIZE(AO1)
400 LFD a3, 2 * SIZE(AO1)
401 LFD a4, 3 * SIZE(AO1)
403 LFD a6, 1 * SIZE(AO2)
404 LFD a7, 2 * SIZE(AO2)
405 LFD a8, 3 * SIZE(AO2)
407 LFD a11, 2 * SIZE(AO3)
408 LFD a12, 3 * SIZE(AO3)
410 LFD a16, 3 * SIZE(AO4)
/* xsum1..4 = diagonal block times (atemp1..4).  The below-diagonal entries */
/* a2, a3, a4, a7, a8 and a12 are each used twice, standing in for their    */
/* mirrored above-diagonal counterparts.                                    */
414 FMUL xsum1, atemp1, a1
415 FMUL xsum2, atemp1, a2
416 FMUL xsum3, atemp1, a3
417 FMUL xsum4, atemp1, a4
419 FMADD xsum1, atemp2, a2, xsum1
420 FMADD xsum2, atemp2, a6, xsum2
421 FMADD xsum3, atemp2, a7, xsum3
422 FMADD xsum4, atemp2, a8, xsum4
424 FMADD xsum1, atemp3, a3, xsum1
425 FMADD xsum2, atemp3, a7, xsum2
426 FMADD xsum3, atemp3, a11, xsum3
427 FMADD xsum4, atemp3, a12, xsum4
429 FMADD xsum1, atemp4, a4, xsum1
430 FMADD xsum2, atemp4, a8, xsum2
431 FMADD xsum3, atemp4, a12, xsum3
432 FMADD xsum4, atemp4, a16, xsum4
/* Scale atemp1..4 by a5.  NOTE(review): a5 is not loaded from the matrix   */
/* above, so it presumably holds alpha loaded from the ALPHA slot on a      */
/* hidden line -- confirm.                                                  */
434 FMUL atemp1, a5, atemp1
435 FMUL atemp2, a5, atemp2
436 FMUL atemp3, a5, atemp3
437 FMUL atemp4, a5, atemp4
/* Preload the four rows directly below the diagonal block (offsets 4..7):  */
/* X values, Y values and the full 4x4 tile from all four columns.          */
439 LFD xtemp1, 4 * SIZE(XX)
440 LFD xtemp2, 5 * SIZE(XX)
441 LFD xtemp3, 6 * SIZE(XX)
442 LFD xtemp4, 7 * SIZE(XX)
444 LFD y01, 4 * SIZE(YY)
445 LFD y02, 5 * SIZE(YY)
446 LFD y03, 6 * SIZE(YY)
447 LFD y04, 7 * SIZE(YY)
449 LFD a1, 4 * SIZE(AO1)
450 LFD a2, 5 * SIZE(AO1)
451 LFD a3, 6 * SIZE(AO1)
452 LFD a4, 7 * SIZE(AO1)
454 LFD a5, 4 * SIZE(AO2)
455 LFD a6, 5 * SIZE(AO2)
456 LFD a7, 6 * SIZE(AO2)
457 LFD a8, 7 * SIZE(AO2)
459 LFD a9, 4 * SIZE(AO3)
460 LFD a10, 5 * SIZE(AO3)
461 LFD a11, 6 * SIZE(AO3)
462 LFD a12, 7 * SIZE(AO3)
464 LFD a13, 4 * SIZE(AO4)
465 LFD a14, 5 * SIZE(AO4)
466 LFD a15, 6 * SIZE(AO4)
467 LFD a16, 7 * SIZE(AO4)
/* Step every pointer past the diagonal block so that offset 0 now refers   */
/* to the rows that were just preloaded.                                    */
469 addi AO1, AO1, 4 * SIZE
470 addi AO2, AO2, 4 * SIZE
471 addi AO3, AO3, 4 * SIZE
472 addi AO4, AO4, 4 * SIZE
474 addi XX, XX, 4 * SIZE
475 addi YY, YY, 4 * SIZE
/* Unrolled body that handles 16 rows below the diagonal block as four      */
/* groups of 4 rows.  The loop label, counter and branch are not visible.   */
/* For each 4x4 tile (a1..a4 from AO1, a5..a8 from AO2, a9..a12 from AO3,   */
/* a13..a16 from AO4):                                                      */
/*   xsumK accumulates (column K of the tile) dot (xtemp1..4)               */
/*   y0R   accumulates (row R of the tile)    dot (atemp1..4)               */
/* FMADD d, a, b, c is read here as d = a * b + c.  The code is software    */
/* pipelined: each operand is reloaded for the next group right after its   */
/* last use in the current group, so statement order matters.               */
/* Group 1: consumes the preloaded tile, reloads from offsets 4..7. */
485 FMADD xsum1, xtemp1, a1, xsum1
487 FMADD y01, atemp1, a1, y01
488 LFD a1, 4 * SIZE(AO1)
490 FMADD xsum2, xtemp1, a5, xsum2
492 FMADD y02, atemp1, a2, y02
495 FMADD xsum3, xtemp1, a9, xsum3
497 FMADD y03, atemp1, a3, y03
500 FMADD xsum4, xtemp1, a13, xsum4
501 LFD xtemp1, 4 * SIZE(XX)
502 FMADD y04, atemp1, a4, y04
505 FMADD xsum1, xtemp2, a2, xsum1
506 LFD a2, 5 * SIZE(AO1)
507 FMADD y01, atemp2, a5, y01
508 LFD a5, 4 * SIZE(AO2)
510 FMADD xsum2, xtemp2, a6, xsum2
512 FMADD y02, atemp2, a6, y02
513 LFD a6, 5 * SIZE(AO2)
515 FMADD xsum3, xtemp2, a10, xsum3
517 FMADD y03, atemp2, a7, y03
520 FMADD xsum4, xtemp2, a14, xsum4
521 LFD xtemp2, 5 * SIZE(XX)
522 FMADD y04, atemp2, a8, y04
526 FMADD xsum1, xtemp3, a3, xsum1
527 LFD a3, 6 * SIZE(AO1)
528 FMADD y01, atemp3, a9, y01
529 LFD a9, 4 * SIZE(AO3)
531 FMADD xsum2, xtemp3, a7, xsum2
532 LFD a7, 6 * SIZE(AO2)
533 FMADD y02, atemp3, a10, y02
534 LFD a10, 5 * SIZE(AO3)
536 FMADD xsum3, xtemp3, a11, xsum3
538 FMADD y03, atemp3, a11, y03
539 LFD a11, 6 * SIZE(AO3)
541 FMADD xsum4, xtemp3, a15, xsum4
542 LFD xtemp3, 6 * SIZE(XX)
543 FMADD y04, atemp3, a12, y04
546 FMADD xsum1, xtemp4, a4, xsum1
547 LFD a4, 7 * SIZE(AO1)
548 FMADD y01, atemp4, a13, y01
549 LFD a13, 4 * SIZE(AO4)
551 FMADD xsum2, xtemp4, a8, xsum2
552 LFD a8, 7 * SIZE(AO2)
553 FMADD y02, atemp4, a14, y02
554 LFD a14, 5 * SIZE(AO4)
556 FMADD xsum3, xtemp4, a12, xsum3
557 LFD a12, 7 * SIZE(AO3)
558 FMADD y03, atemp4, a15, y03
559 LFD a15, 6 * SIZE(AO4)
561 FMADD xsum4, xtemp4, a16, xsum4
562 LFD xtemp4, 7 * SIZE(XX)
563 FMADD y04, atemp4, a16, y04
564 LFD a16, 7 * SIZE(AO4)
/* Write back the finished Y rows 0..3 and fetch rows 4..7. */
566 STFD y01, 0 * SIZE(YY)
567 LFD y01, 4 * SIZE(YY)
568 STFD y02, 1 * SIZE(YY)
569 LFD y02, 5 * SIZE(YY)
571 STFD y03, 2 * SIZE(YY)
572 LFD y03, 6 * SIZE(YY)
573 STFD y04, 3 * SIZE(YY)
574 LFD y04, 7 * SIZE(YY)
/* Group 2: consumes rows 4..7, reloads from offsets 8..11. */
576 FMADD xsum1, xtemp1, a1, xsum1
578 FMADD y01, atemp1, a1, y01
579 LFD a1, 8 * SIZE(AO1)
581 FMADD xsum2, xtemp1, a5, xsum2
583 FMADD y02, atemp1, a2, y02
586 FMADD xsum3, xtemp1, a9, xsum3
588 FMADD y03, atemp1, a3, y03
591 FMADD xsum4, xtemp1, a13, xsum4
592 LFD xtemp1, 8 * SIZE(XX)
593 FMADD y04, atemp1, a4, y04
596 FMADD xsum1, xtemp2, a2, xsum1
597 LFD a2, 9 * SIZE(AO1)
598 FMADD y01, atemp2, a5, y01
599 LFD a5, 8 * SIZE(AO2)
601 FMADD xsum2, xtemp2, a6, xsum2
603 FMADD y02, atemp2, a6, y02
604 LFD a6, 9 * SIZE(AO2)
606 FMADD xsum3, xtemp2, a10, xsum3
608 FMADD y03, atemp2, a7, y03
611 FMADD xsum4, xtemp2, a14, xsum4
612 LFD xtemp2, 9 * SIZE(XX)
613 FMADD y04, atemp2, a8, y04
616 FMADD xsum1, xtemp3, a3, xsum1
617 LFD a3, 10 * SIZE(AO1)
618 FMADD y01, atemp3, a9, y01
619 LFD a9, 8 * SIZE(AO3)
621 FMADD xsum2, xtemp3, a7, xsum2
622 LFD a7, 10 * SIZE(AO2)
623 FMADD y02, atemp3, a10, y02
624 LFD a10, 9 * SIZE(AO3)
626 FMADD xsum3, xtemp3, a11, xsum3
628 FMADD y03, atemp3, a11, y03
629 LFD a11, 10 * SIZE(AO3)
631 FMADD xsum4, xtemp3, a15, xsum4
632 LFD xtemp3, 10 * SIZE(XX)
633 FMADD y04, atemp3, a12, y04
636 FMADD xsum1, xtemp4, a4, xsum1
637 LFD a4, 11 * SIZE(AO1)
638 FMADD y01, atemp4, a13, y01
639 LFD a13, 8 * SIZE(AO4)
641 FMADD xsum2, xtemp4, a8, xsum2
642 LFD a8, 11 * SIZE(AO2)
643 FMADD y02, atemp4, a14, y02
644 LFD a14, 9 * SIZE(AO4)
646 FMADD xsum3, xtemp4, a12, xsum3
647 LFD a12, 11 * SIZE(AO3)
648 FMADD y03, atemp4, a15, y03
649 LFD a15, 10 * SIZE(AO4)
651 FMADD xsum4, xtemp4, a16, xsum4
652 LFD xtemp4, 11 * SIZE(XX)
653 FMADD y04, atemp4, a16, y04
654 LFD a16, 11 * SIZE(AO4)
/* Write back Y rows 4..7 and fetch rows 8..11. */
656 STFD y01, 4 * SIZE(YY)
657 LFD y01, 8 * SIZE(YY)
658 STFD y02, 5 * SIZE(YY)
659 LFD y02, 9 * SIZE(YY)
661 STFD y03, 6 * SIZE(YY)
662 LFD y03, 10 * SIZE(YY)
663 STFD y04, 7 * SIZE(YY)
664 LFD y04, 11 * SIZE(YY)
/* Group 3: consumes rows 8..11, reloads from offsets 12..15. */
667 FMADD xsum1, xtemp1, a1, xsum1
669 FMADD y01, atemp1, a1, y01
670 LFD a1, 12 * SIZE(AO1)
672 FMADD xsum2, xtemp1, a5, xsum2
674 FMADD y02, atemp1, a2, y02
677 FMADD xsum3, xtemp1, a9, xsum3
679 FMADD y03, atemp1, a3, y03
682 FMADD xsum4, xtemp1, a13, xsum4
683 LFD xtemp1, 12 * SIZE(XX)
684 FMADD y04, atemp1, a4, y04
687 FMADD xsum1, xtemp2, a2, xsum1
688 LFD a2, 13 * SIZE(AO1)
689 FMADD y01, atemp2, a5, y01
690 LFD a5, 12 * SIZE(AO2)
692 FMADD xsum2, xtemp2, a6, xsum2
694 FMADD y02, atemp2, a6, y02
695 LFD a6, 13 * SIZE(AO2)
697 FMADD xsum3, xtemp2, a10, xsum3
699 FMADD y03, atemp2, a7, y03
703 FMADD xsum4, xtemp2, a14, xsum4
704 LFD xtemp2, 13 * SIZE(XX)
705 FMADD y04, atemp2, a8, y04
708 FMADD xsum1, xtemp3, a3, xsum1
709 LFD a3, 14 * SIZE(AO1)
710 FMADD y01, atemp3, a9, y01
711 LFD a9, 12 * SIZE(AO3)
713 FMADD xsum2, xtemp3, a7, xsum2
714 LFD a7, 14 * SIZE(AO2)
715 FMADD y02, atemp3, a10, y02
716 LFD a10,13 * SIZE(AO3)
718 FMADD xsum3, xtemp3, a11, xsum3
720 FMADD y03, atemp3, a11, y03
721 LFD a11, 14 * SIZE(AO3)
723 FMADD xsum4, xtemp3, a15, xsum4
724 LFD xtemp3, 14 * SIZE(XX)
725 FMADD y04, atemp3, a12, y04
728 FMADD xsum1, xtemp4, a4, xsum1
729 LFD a4, 15 * SIZE(AO1)
730 FMADD y01, atemp4, a13, y01
731 LFD a13,12 * SIZE(AO4)
733 FMADD xsum2, xtemp4, a8, xsum2
734 LFD a8, 15 * SIZE(AO2)
735 FMADD y02, atemp4, a14, y02
736 LFD a14, 13 * SIZE(AO4)
738 FMADD xsum3, xtemp4, a12, xsum3
739 LFD a12, 15 * SIZE(AO3)
740 FMADD y03, atemp4, a15, y03
741 LFD a15, 14 * SIZE(AO4)
743 FMADD xsum4, xtemp4, a16, xsum4
744 LFD xtemp4, 15 * SIZE(XX)
745 FMADD y04, atemp4, a16, y04
746 LFD a16, 15 * SIZE(AO4)
/* Write back Y rows 8..11 and fetch rows 12..15. */
748 STFD y01, 8 * SIZE(YY)
749 LFD y01, 12 * SIZE(YY)
750 STFD y02, 9 * SIZE(YY)
751 LFD y02, 13 * SIZE(YY)
753 STFD y03, 10 * SIZE(YY)
754 LFD y03, 14 * SIZE(YY)
755 STFD y04, 11 * SIZE(YY)
756 LFD y04, 15 * SIZE(YY)
/* Group 4: consumes rows 12..15 and preloads the first tile of the next    */
/* 16-row pass.  The pointer bumps (+16 elements) are interleaved with the  */
/* arithmetic, so a load placed BEFORE its pointer's bump uses offsets      */
/* 16..18 while a load placed AFTER the bump uses offsets 0..3 for the same */
/* logical rows.                                                            */
758 FMADD xsum1, xtemp1, a1, xsum1
760 FMADD y01, atemp1, a1, y01
761 LFD a1, 16 * SIZE(AO1)
763 FMADD xsum2, xtemp1, a5, xsum2
765 FMADD y02, atemp1, a2, y02
768 FMADD xsum3, xtemp1, a9, xsum3
770 FMADD y03, atemp1, a3, y03
773 FMADD xsum4, xtemp1, a13, xsum4
774 LFD xtemp1, 16 * SIZE(XX)
775 FMADD y04, atemp1, a4, y04
/* YY advances first; the Y stores at the end of this group therefore use   */
/* negative offsets.                                                        */
776 addi YY, YY, 16 * SIZE
778 FMADD xsum1, xtemp2, a2, xsum1
779 LFD a2, 17 * SIZE(AO1)
780 FMADD y01, atemp2, a5, y01
781 LFD a5, 16 * SIZE(AO2)
783 FMADD xsum2, xtemp2, a6, xsum2
784 addi AO3, AO3, 16 * SIZE
785 FMADD y02, atemp2, a6, y02
786 LFD a6, 17 * SIZE(AO2)
788 FMADD xsum3, xtemp2, a10, xsum3
789 addi AO1, AO1, 16 * SIZE
790 FMADD y03, atemp2, a7, y03
791 addi AO2, AO2, 16 * SIZE
793 FMADD xsum4, xtemp2, a14, xsum4
794 LFD xtemp2, 17 * SIZE(XX)
795 FMADD y04, atemp2, a8, y04
796 addi AO4, AO4, 16 * SIZE
/* From here on AO1..AO4 are already advanced: offsets restart at 0..3. */
798 FMADD xsum1, xtemp3, a3, xsum1
799 LFD a3, 2 * SIZE(AO1)
800 FMADD y01, atemp3, a9, y01
801 LFD a9, 0 * SIZE(AO3)
803 FMADD xsum2, xtemp3, a7, xsum2
804 LFD a7, 2 * SIZE(AO2)
805 FMADD y02, atemp3, a10, y02
806 LFD a10, 1 * SIZE(AO3)
808 FMADD xsum3, xtemp3, a11, xsum3
810 FMADD y03, atemp3, a11, y03
811 LFD a11, 2 * SIZE(AO3)
813 FMADD xsum4, xtemp3, a15, xsum4
/* Last X load before XX is advanced (offset 18); xtemp4 below is loaded    */
/* after the bump and uses offset 3.                                        */
814 LFD xtemp3, 18 * SIZE(XX)
815 FMADD y04, atemp3, a12, y04
816 addi XX, XX, 16 * SIZE
818 FMADD xsum1, xtemp4, a4, xsum1
819 LFD a4, 3 * SIZE(AO1)
820 FMADD y01, atemp4, a13, y01
821 LFD a13, 0 * SIZE(AO4)
823 FMADD xsum2, xtemp4, a8, xsum2
824 LFD a8, 3 * SIZE(AO2)
825 FMADD y02, atemp4, a14, y02
826 LFD a14, 1 * SIZE(AO4)
828 FMADD xsum3, xtemp4, a12, xsum3
829 LFD a12, 3 * SIZE(AO3)
830 FMADD y03, atemp4, a15, y03
831 LFD a15, 2 * SIZE(AO4)
833 FMADD xsum4, xtemp4, a16, xsum4
834 LFD xtemp4, 3 * SIZE(XX)
835 FMADD y04, atemp4, a16, y04
836 LFD a16, 3 * SIZE(AO4)
/* Write back Y rows 12..15 (now at offsets -4..-1 relative to the advanced */
/* YY) and fetch the first four Y rows of the next pass.                    */
838 STFD y01, -4 * SIZE(YY)
839 LFD y01, 0 * SIZE(YY)
840 STFD y02, -3 * SIZE(YY)
841 LFD y02, 1 * SIZE(YY)
843 STFD y03, -2 * SIZE(YY)
844 LFD y03, 2 * SIZE(YY)
845 STFD y04, -1 * SIZE(YY)
846 LFD y04, 3 * SIZE(YY)
/* Eight-row step: two 4-row groups using the same tile arithmetic as the   */
/* 16-row body (xsumK += tile column K dot xtemp1..4; y0R += tile row R dot */
/* atemp1..4).  Presumably executed when at least eight rows remain after   */
/* the 16-row passes; the test and branch that select it are not visible.   */
/* Group 1: consumes the preloaded tile, reloads from offsets 4..7. */
856 FMADD xsum1, xtemp1, a1, xsum1
858 FMADD y01, atemp1, a1, y01
859 LFD a1, 4 * SIZE(AO1)
861 FMADD xsum2, xtemp1, a5, xsum2
863 FMADD y02, atemp1, a2, y02
866 FMADD xsum3, xtemp1, a9, xsum3
868 FMADD y03, atemp1, a3, y03
871 FMADD xsum4, xtemp1, a13, xsum4
872 LFD xtemp1, 4 * SIZE(XX)
873 FMADD y04, atemp1, a4, y04
876 FMADD xsum1, xtemp2, a2, xsum1
877 LFD a2, 5 * SIZE(AO1)
878 FMADD y01, atemp2, a5, y01
879 LFD a5, 4 * SIZE(AO2)
881 FMADD xsum2, xtemp2, a6, xsum2
883 FMADD y02, atemp2, a6, y02
884 LFD a6, 5 * SIZE(AO2)
886 FMADD xsum3, xtemp2, a10, xsum3
888 FMADD y03, atemp2, a7, y03
891 FMADD xsum4, xtemp2, a14, xsum4
892 LFD xtemp2, 5 * SIZE(XX)
893 FMADD y04, atemp2, a8, y04
896 FMADD xsum1, xtemp3, a3, xsum1
897 LFD a3, 6 * SIZE(AO1)
898 FMADD y01, atemp3, a9, y01
899 LFD a9, 4 * SIZE(AO3)
901 FMADD xsum2, xtemp3, a7, xsum2
902 LFD a7, 6 * SIZE(AO2)
903 FMADD y02, atemp3, a10, y02
904 LFD a10, 5 * SIZE(AO3)
906 FMADD xsum3, xtemp3, a11, xsum3
908 FMADD y03, atemp3, a11, y03
909 LFD a11, 6 * SIZE(AO3)
911 FMADD xsum4, xtemp3, a15, xsum4
912 LFD xtemp3, 6 * SIZE(XX)
913 FMADD y04, atemp3, a12, y04
916 FMADD xsum1, xtemp4, a4, xsum1
917 LFD a4, 7 * SIZE(AO1)
918 FMADD y01, atemp4, a13, y01
919 LFD a13, 4 * SIZE(AO4)
921 FMADD xsum2, xtemp4, a8, xsum2
922 LFD a8, 7 * SIZE(AO2)
923 FMADD y02, atemp4, a14, y02
924 LFD a14, 5 * SIZE(AO4)
926 FMADD xsum3, xtemp4, a12, xsum3
927 LFD a12, 7 * SIZE(AO3)
928 FMADD y03, atemp4, a15, y03
929 LFD a15, 6 * SIZE(AO4)
931 FMADD xsum4, xtemp4, a16, xsum4
932 LFD xtemp4, 7 * SIZE(XX)
933 FMADD y04, atemp4, a16, y04
934 LFD a16, 7 * SIZE(AO4)
/* Write back Y rows 0..3 and fetch rows 4..7. */
936 STFD y01, 0 * SIZE(YY)
937 LFD y01, 4 * SIZE(YY)
938 STFD y02, 1 * SIZE(YY)
939 LFD y02, 5 * SIZE(YY)
941 STFD y03, 2 * SIZE(YY)
942 LFD y03, 6 * SIZE(YY)
943 STFD y04, 3 * SIZE(YY)
944 LFD y04, 7 * SIZE(YY)
/* Group 2: consumes rows 4..7, reloads from offsets 8..11. */
946 FMADD xsum1, xtemp1, a1, xsum1
948 FMADD y01, atemp1, a1, y01
949 LFD a1, 8 * SIZE(AO1)
951 FMADD xsum2, xtemp1, a5, xsum2
953 FMADD y02, atemp1, a2, y02
956 FMADD xsum3, xtemp1, a9, xsum3
958 FMADD y03, atemp1, a3, y03
961 FMADD xsum4, xtemp1, a13, xsum4
962 LFD xtemp1, 8 * SIZE(XX)
963 FMADD y04, atemp1, a4, y04
966 FMADD xsum1, xtemp2, a2, xsum1
967 LFD a2, 9 * SIZE(AO1)
968 FMADD y01, atemp2, a5, y01
969 LFD a5, 8 * SIZE(AO2)
971 FMADD xsum2, xtemp2, a6, xsum2
973 FMADD y02, atemp2, a6, y02
974 LFD a6, 9 * SIZE(AO2)
976 FMADD xsum3, xtemp2, a10, xsum3
978 FMADD y03, atemp2, a7, y03
981 FMADD xsum4, xtemp2, a14, xsum4
982 LFD xtemp2, 9 * SIZE(XX)
983 FMADD y04, atemp2, a8, y04
986 FMADD xsum1, xtemp3, a3, xsum1
987 LFD a3, 10 * SIZE(AO1)
988 FMADD y01, atemp3, a9, y01
989 LFD a9, 8 * SIZE(AO3)
991 FMADD xsum2, xtemp3, a7, xsum2
992 LFD a7, 10 * SIZE(AO2)
993 FMADD y02, atemp3, a10, y02
994 LFD a10, 9 * SIZE(AO3)
996 FMADD xsum3, xtemp3, a11, xsum3
998 FMADD y03, atemp3, a11, y03
999 LFD a11, 10 * SIZE(AO3)
1001 FMADD xsum4, xtemp3, a15, xsum4
1002 LFD xtemp3, 10 * SIZE(XX)
1003 FMADD y04, atemp3, a12, y04
1006 FMADD xsum1, xtemp4, a4, xsum1
1007 LFD a4, 11 * SIZE(AO1)
1008 FMADD y01, atemp4, a13, y01
1009 LFD a13, 8 * SIZE(AO4)
1011 FMADD xsum2, xtemp4, a8, xsum2
1012 LFD a8, 11 * SIZE(AO2)
1013 FMADD y02, atemp4, a14, y02
1014 LFD a14, 9 * SIZE(AO4)
1016 FMADD xsum3, xtemp4, a12, xsum3
1017 LFD a12, 11 * SIZE(AO3)
1018 FMADD y03, atemp4, a15, y03
1019 LFD a15, 10 * SIZE(AO4)
1021 FMADD xsum4, xtemp4, a16, xsum4
1022 LFD xtemp4, 11 * SIZE(XX)
1023 FMADD y04, atemp4, a16, y04
1024 LFD a16, 11 * SIZE(AO4)
/* All matrix loads for this step are issued: advance the column pointers   */
/* by the eight rows consumed.                                              */
1026 addi AO1, AO1, 8 * SIZE
1027 addi AO2, AO2, 8 * SIZE
1028 addi AO3, AO3, 8 * SIZE
1029 addi AO4, AO4, 8 * SIZE
/* Write back Y rows 4..7 and fetch rows 8..11, then advance XX and YY. */
1031 STFD y01, 4 * SIZE(YY)
1032 LFD y01, 8 * SIZE(YY)
1033 STFD y02, 5 * SIZE(YY)
1034 LFD y02, 9 * SIZE(YY)
1036 STFD y03, 6 * SIZE(YY)
1037 LFD y03, 10 * SIZE(YY)
1038 STFD y04, 7 * SIZE(YY)
1039 LFD y04, 11 * SIZE(YY)
1041 addi XX, XX, 8 * SIZE
1042 addi YY, YY, 8 * SIZE
/* Four-row step: one 4x4 tile using the same arithmetic as the unrolled    */
/* body (xsumK += tile column K dot xtemp1..4; y0R += tile row R dot        */
/* atemp1..4), reloading every operand from offsets 4..7 for whatever       */
/* follows.  Presumably executed when at least four rows remain; the        */
/* selecting test and branch are not visible.                               */
1051 FMADD xsum1, xtemp1, a1, xsum1
1053 FMADD y01, atemp1, a1, y01
1054 LFD a1, 4 * SIZE(AO1)
1056 FMADD xsum2, xtemp1, a5, xsum2
1058 FMADD y02, atemp1, a2, y02
1061 FMADD xsum3, xtemp1, a9, xsum3
1063 FMADD y03, atemp1, a3, y03
1066 FMADD xsum4, xtemp1, a13, xsum4
1067 LFD xtemp1, 4 * SIZE(XX)
1068 FMADD y04, atemp1, a4, y04
1071 FMADD xsum1, xtemp2, a2, xsum1
1072 LFD a2, 5 * SIZE(AO1)
1073 FMADD y01, atemp2, a5, y01
1074 LFD a5, 4 * SIZE(AO2)
1076 FMADD xsum2, xtemp2, a6, xsum2
1078 FMADD y02, atemp2, a6, y02
1079 LFD a6, 5 * SIZE(AO2)
1081 FMADD xsum3, xtemp2, a10, xsum3
1083 FMADD y03, atemp2, a7, y03
1086 FMADD xsum4, xtemp2, a14, xsum4
1087 LFD xtemp2, 5 * SIZE(XX)
1088 FMADD y04, atemp2, a8, y04
1091 FMADD xsum1, xtemp3, a3, xsum1
1092 LFD a3, 6 * SIZE(AO1)
1093 FMADD y01, atemp3, a9, y01
1094 LFD a9, 4 * SIZE(AO3)
1096 FMADD xsum2, xtemp3, a7, xsum2
1097 LFD a7, 6 * SIZE(AO2)
1098 FMADD y02, atemp3, a10, y02
1099 LFD a10, 5 * SIZE(AO3)
1101 FMADD xsum3, xtemp3, a11, xsum3
1103 FMADD y03, atemp3, a11, y03
1104 LFD a11, 6 * SIZE(AO3)
1106 FMADD xsum4, xtemp3, a15, xsum4
1107 LFD xtemp3, 6 * SIZE(XX)
1108 FMADD y04, atemp3, a12, y04
1111 FMADD xsum1, xtemp4, a4, xsum1
1112 LFD a4, 7 * SIZE(AO1)
1113 FMADD y01, atemp4, a13, y01
1114 LFD a13, 4 * SIZE(AO4)
1116 FMADD xsum2, xtemp4, a8, xsum2
1117 LFD a8, 7 * SIZE(AO2)
1118 FMADD y02, atemp4, a14, y02
1119 LFD a14, 5 * SIZE(AO4)
1121 FMADD xsum3, xtemp4, a12, xsum3
1122 LFD a12, 7 * SIZE(AO3)
1123 FMADD y03, atemp4, a15, y03
1124 LFD a15, 6 * SIZE(AO4)
1126 FMADD xsum4, xtemp4, a16, xsum4
1127 LFD xtemp4, 7 * SIZE(XX)
1128 FMADD y04, atemp4, a16, y04
1129 LFD a16, 7 * SIZE(AO4)
/* Advance the column pointers by the four rows consumed. */
1131 addi AO1, AO1, 4 * SIZE
1132 addi AO2, AO2, 4 * SIZE
1133 addi AO3, AO3, 4 * SIZE
1134 addi AO4, AO4, 4 * SIZE
/* Write back Y rows 0..3, fetch rows 4..7, then advance XX and YY. */
1136 STFD y01, 0 * SIZE(YY)
1137 LFD y01, 4 * SIZE(YY)
1138 STFD y02, 1 * SIZE(YY)
1139 LFD y02, 5 * SIZE(YY)
1141 STFD y03, 2 * SIZE(YY)
1142 LFD y03, 6 * SIZE(YY)
1143 STFD y04, 3 * SIZE(YY)
1144 LFD y04, 7 * SIZE(YY)
1146 addi XX, XX, 4 * SIZE
1147 addi YY, YY, 4 * SIZE
/* Two-row step.  Uses only the first two rows of the preloaded tile        */
/* (a1,a2 / a5,a6 / a9,a10 / a13,a14) with xtemp1..2 and y01..2:            */
/*   xsumK += (rows 0..1 of column K) dot (xtemp1, xtemp2)                  */
/*   y01   += atemp1*a1 + atemp2*a5 + atemp3*a9  + atemp4*a13               */
/*   y02   += atemp1*a2 + atemp2*a6 + atemp3*a10 + atemp4*a14               */
/* Row 2 of each column, xtemp1 and y01 are reloaded from offset 2 so a     */
/* following single-row step can use them.  Presumably selected by a hidden */
/* remaining-rows test.                                                     */
1154 FMADD xsum1, xtemp1, a1, xsum1
1155 FMADD y01, atemp1, a1, y01
1156 LFD a1, 2 * SIZE(AO1)
1158 FMADD xsum2, xtemp1, a5, xsum2
1159 FMADD y02, atemp1, a2, y02
1161 FMADD xsum3, xtemp1, a9, xsum3
1162 FMADD y01, atemp2, a5, y01
1163 LFD a5, 2 * SIZE(AO2)
1165 FMADD xsum4, xtemp1, a13, xsum4
1166 LFD xtemp1, 2 * SIZE(XX)
1167 FMADD y02, atemp2, a6, y02
1169 FMADD xsum1, xtemp2, a2, xsum1
1170 FMADD y01, atemp3, a9, y01
1171 LFD a9, 2 * SIZE(AO3)
1173 FMADD xsum2, xtemp2, a6, xsum2
1174 FMADD y02, atemp3, a10, y02
1176 FMADD xsum3, xtemp2, a10, xsum3
1177 FMADD y01, atemp4, a13, y01
1178 LFD a13, 2 * SIZE(AO4)
1180 FMADD xsum4, xtemp2, a14, xsum4
1181 FMADD y02, atemp4, a14, y02
/* Write back the two finished Y rows; only YY is advanced here (no AO or   */
/* XX update is visible in this step).                                      */
1183 STFD y01, 0 * SIZE(YY)
1184 LFD y01, 2 * SIZE(YY)
1185 STFD y02, 1 * SIZE(YY)
1186 addi YY, YY, 2 * SIZE
/* Single-row step: one row across the four columns (a1, a5, a9, a13).      */
/*   xsumK += xtemp1 * (this row of column K)                               */
/*   y01   += atemp1*a1 + atemp2*a5 + atemp3*a9 + atemp4*a13                */
/* Presumably selected by a hidden remaining-rows test.                     */
1193 FMADD xsum1, xtemp1, a1, xsum1
1194 FMADD y01, atemp1, a1, y01
1195 FMADD xsum2, xtemp1, a5, xsum2
1196 FMADD y01, atemp2, a5, y01
1197 FMADD xsum3, xtemp1, a9, xsum3
1198 FMADD y01, atemp3, a9, y01
1199 FMADD xsum4, xtemp1, a13, xsum4
1200 FMADD y01, atemp4, a13, y01
/* Write back the finished Y row. */
1202 STFD y01, 0 * SIZE(YY)
/* Fold the four column accumulators into Y.  TEMP = IS shifted left by     */
/* BASE_SHIFT; presumably a hidden instruction uses it to point YY back at  */
/* the four Y entries of this panel -- TODO confirm.                        */
1206 slwi TEMP, IS, BASE_SHIFT
1209 LFD y01, 0 * SIZE(YY)
1210 LFD y02, 1 * SIZE(YY)
1211 LFD y03, 2 * SIZE(YY)
1212 LFD y04, 3 * SIZE(YY)
/* Scale xsum1..4 by xtemp1.  NOTE(review): xtemp1 is presumably reloaded   */
/* with alpha on a hidden line just before this -- confirm.                 */
1216 FMUL xsum1, xtemp1, xsum1
1217 FMUL xsum2, xtemp1, xsum2
1218 FMUL xsum3, xtemp1, xsum3
1219 FMUL xsum4, xtemp1, xsum4
/* y0K += scaled xsumK, then store the four results back. */
1221 FADD y01, y01, xsum1
1222 FADD y02, y02, xsum2
1223 FADD y03, y03, xsum3
1224 FADD y04, y04, xsum4
1226 STFD y01, 0 * SIZE(YY)
1227 STFD y02, 1 * SIZE(YY)
1228 STFD y03, 2 * SIZE(YY)
1229 STFD y04, 3 * SIZE(YY)
/* Two-column panel (presumably the column remainder after the four-column  */
/* panels; the selecting test is not visible).  TEMP = IS shifted left by   */
/* BASE_SHIFT.                                                              */
1246 slwi TEMP, IS, BASE_SHIFT
/* Two X values and the 2x2 diagonal block; only entries on or below the    */
/* diagonal are read (rows 0..1 of AO1, row 1 of AO2).                      */
1250 LFD atemp1, 0 * SIZE(XX)
1251 LFD atemp2, 1 * SIZE(XX)
1253 LFD a1, 0 * SIZE(AO1)
1254 LFD a2, 1 * SIZE(AO1)
1255 LFD a6, 1 * SIZE(AO2)
/* xsum1..2 = diagonal block times (atemp1, atemp2); a2 is used twice,      */
/* standing in for its mirrored above-diagonal counterpart.                 */
1259 FMUL xsum1, atemp1, a1
1260 FMUL xsum2, atemp1, a2
1262 FMADD xsum1, atemp2, a2, xsum1
1263 FMADD xsum2, atemp2, a6, xsum2
/* Scale atemp1..2 by a5 (presumably alpha, loaded on a hidden line). */
1265 FMUL atemp1, a5, atemp1
1266 FMUL atemp2, a5, atemp2
/* Operands for the row directly below the 2x2 block (offset 2). */
1268 LFD xtemp1, 2 * SIZE(XX)
1269 LFD y01, 2 * SIZE(YY)
1270 LFD a1, 2 * SIZE(AO1)
1271 LFD a5, 2 * SIZE(AO2)
/* Process that one row: it contributes to both column sums and receives    */
/* atemp1*a1 + atemp2*a5.  Presumably executed only when such a row exists  */
/* (hidden branch) -- TODO confirm.                                         */
1276 FMADD xsum1, xtemp1, a1, xsum1
1277 FMADD y01, atemp1, a1, y01
1278 FMADD xsum2, xtemp1, a5, xsum2
1279 FMADD y01, atemp2, a5, y01
1281 STFD y01, 2 * SIZE(YY)
/* Fold the two column accumulators into Y: scale by xtemp1 (presumably     */
/* alpha, reloaded on a hidden line), add to Y[0..1], store.                */
1285 slwi TEMP, IS, BASE_SHIFT
1288 LFD y01, 0 * SIZE(YY)
1289 LFD y02, 1 * SIZE(YY)
1293 FMUL xsum1, xtemp1, xsum1
1294 FMUL xsum2, xtemp1, xsum2
1296 FADD y01, y01, xsum1
1297 FADD y02, y02, xsum2
1299 STFD y01, 0 * SIZE(YY)
1300 STFD y02, 1 * SIZE(YY)
/* Single-column panel: only the diagonal element remains.  TEMP = IS       */
/* shifted left by BASE_SHIFT (its use is on hidden lines).                 */
1311 slwi TEMP, IS, BASE_SHIFT
1315 LFD atemp1, 0 * SIZE(XX)
1316 LFD a1, 0 * SIZE(AO1)
1318 LFD y01, 0 * SIZE(YY)
/* xsum1 = x * a(diagonal), then scaled by xtemp1 (presumably alpha, loaded */
/* on a hidden line -- confirm).                                            */
1320 FMUL xsum1, atemp1, a1
1321 FMUL xsum1, xtemp1, xsum1
/* y += scaled product, store back. */
1323 FADD y01, y01, xsum1
1325 STFD y01, 0 * SIZE(YY)
/* Compare the scaled Y stride with SIZE, i.e. test for a contiguous Y.     */
/* Presumably the copy-back below is skipped when Y was processed in place; */
/* the branch that consumes cr0 is not visible.                             */
1329 cmpwi cr0, INCY, SIZE
/* Eight-at-a-time: read eight contiguous results from NEW_Y ... */
1357 LFD f8, 0 * SIZE(NEW_Y)
1358 LFD f9, 1 * SIZE(NEW_Y)
1359 LFD f10, 2 * SIZE(NEW_Y)
1360 LFD f11, 3 * SIZE(NEW_Y)
1361 LFD f12, 4 * SIZE(NEW_Y)
1362 LFD f13, 5 * SIZE(NEW_Y)
1363 LFD f14, 6 * SIZE(NEW_Y)
1364 LFD f15, 7 * SIZE(NEW_Y)
1365 addi NEW_Y, NEW_Y, 8 * SIZE
/* ... and store each one at offset 0 of YY.  NOTE(review): every store     */
/* uses the same offset, so YY is presumably advanced by INCY on the hidden */
/* lines between the stores (and the values may be combined with the        */
/* caller's Y there) -- confirm against the full source.                    */
1376 STFD f8, 0 * SIZE(YY)
1378 STFD f9, 0 * SIZE(YY)
1380 STFD f10, 0 * SIZE(YY)
1382 STFD f11, 0 * SIZE(YY)
1384 STFD f12, 0 * SIZE(YY)
1386 STFD f13, 0 * SIZE(YY)
1388 STFD f14, 0 * SIZE(YY)
1390 STFD f15, 0 * SIZE(YY)
/* Four-at-a-time variant of the same copy. */
1408 LFD f8, 0 * SIZE(NEW_Y)
1409 LFD f9, 1 * SIZE(NEW_Y)
1410 LFD f10, 2 * SIZE(NEW_Y)
1411 LFD f11, 3 * SIZE(NEW_Y)
1412 addi NEW_Y, NEW_Y, 4 * SIZE
1419 STFD f8, 0 * SIZE(YY)
1421 STFD f9, 0 * SIZE(YY)
1423 STFD f10, 0 * SIZE(YY)
1425 STFD f11, 0 * SIZE(YY)
/* Two-at-a-time variant. */
1438 LFD f8, 0 * SIZE(NEW_Y)
1439 LFD f9, 1 * SIZE(NEW_Y)
1440 addi NEW_Y, NEW_Y, 2 * SIZE
1445 STFD f8, 0 * SIZE(YY)
1447 STFD f9, 0 * SIZE(YY)
/* Final single element. */
1456 LFD f8, 0 * SIZE(NEW_Y)
1460 STFD f8, 0 * SIZE(YY)
/* Exit: release the frame by adding back the STACKSIZE bytes subtracted on */
/* entry.  Register restores and the return instruction are not visible in  */
/* this excerpt.                                                            */
1517 addi SP, SP, STACKSIZE