1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration macros. The #else/#endif lines and several */
/* guards of this conditional structure are not visible in this view. */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A is multiplied by SIZE and loaded into PREA further down. */
/* The repeated definitions presumably sit under per-target guards; only */
/* the PPC440/PPC440FP2 guard is visible here -- TODO confirm the others. */
144 #define PREFETCHSIZE_A 24
147 #if defined(PPC440) || defined(PPC440FP2)
148 #define PREFETCHSIZE_A 24
152 #define PREFETCHSIZE_A 64
156 #define PREFETCHSIZE_A 72
160 #define PREFETCHSIZE_A 16
164 #define PREFETCHSIZE_A 96
168 #define PREFETCHSIZE_A 40
172 #define PREFETCHSIZE_A 40
175 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
/* NOP1/NOP2 expand to self-moves (mr rX, rX), which leave LDA and INCX */
/* unchanged; presumably used as scheduling filler -- TODO confirm. */
179 #define NOP1 mr LDA, LDA
180 #define NOP2 mr INCX, INCX
/* Two alternative stack-frame layouts (the selecting condition is not */
/* visible). ALPHA and FZERO name two slots, 8 bytes apart, addressed */
/* relative to SP inside the STACKSIZE-byte frame. */
186 #define STACKSIZE 224
187 #define ALPHA 200(SP)
188 #define FZERO 208(SP)
190 #define STACKSIZE 280
191 #define ALPHA 256(SP)
192 #define FZERO 264(SP)
/* Entry sequence (fragment: any lines between these are not visible). */
/* Lower SP by STACKSIZE to open this routine's stack frame. */
198 addi SP, SP, -STACKSIZE
/* Arguments passed in memory are read at FRAMESLOT(n) + STACKSIZE(SP), */
/* i.e. FRAMESLOT(n) relative to the SP value before the addi above. */
/* lwz reads a 32-bit word, ld a 64-bit doubleword; which of the variants */
/* below is assembled depends on guards that are only partly visible. */
257 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
259 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
260 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
264 #if defined(_AIX) || defined(__APPLE__)
267 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
268 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
269 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
271 lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
272 lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
275 ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
276 ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
/* Shift LDA, INCX and INCY left by BASE_SHIFT; presumably converts */
/* element counts into byte offsets -- BASE_SHIFT is defined elsewhere. */
282 slwi LDA, LDA, BASE_SHIFT
283 slwi INCX, INCX, BASE_SHIFT
284 slwi INCY, INCY, BASE_SHIFT
/* PREA = PREFETCHSIZE_A * SIZE (its consumers are not visible here). */
286 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the scaled INCX with SIZE; the branch that consumes cr0 is not */
/* shown (presumably selects the contiguous-X path -- TODO confirm). */
291 cmpwi cr0, INCX, SIZE
/* Store a1..a8 into eight consecutive SIZE-spaced slots at BUFFER, then */
/* advance BUFFER past them. The loads that fill a1..a8 and the loop */
/* control are not visible; presumably this packs a strided X vector into */
/* contiguous storage -- TODO confirm. */
323 STFD a1, 0 * SIZE(BUFFER)
324 STFD a2, 1 * SIZE(BUFFER)
325 STFD a3, 2 * SIZE(BUFFER)
326 STFD a4, 3 * SIZE(BUFFER)
327 STFD a5, 4 * SIZE(BUFFER)
328 STFD a6, 5 * SIZE(BUFFER)
329 STFD a7, 6 * SIZE(BUFFER)
330 STFD a8, 7 * SIZE(BUFFER)
332 addi BUFFER, BUFFER, 8 * SIZE
/* Single-element variant of the same store: one value, BUFFER += SIZE. */
346 STFD a1, 0 * SIZE(BUFFER)
347 addi BUFFER, BUFFER, 1 * SIZE
/* Compare the scaled INCY with SIZE (the consuming branch is not shown). */
355 cmpwi cr0, INCY, SIZE
/* Write f0 to eight consecutive slots at BUFFER and advance BUFFER by */
/* 8 * SIZE. f0 is set on a line not visible here; presumably it holds */
/* zero (loaded from FZERO), clearing a temporary Y area -- TODO confirm. */
366 STFD f0, 0 * SIZE(BUFFER)
367 STFD f0, 1 * SIZE(BUFFER)
368 STFD f0, 2 * SIZE(BUFFER)
369 STFD f0, 3 * SIZE(BUFFER)
370 STFD f0, 4 * SIZE(BUFFER)
371 STFD f0, 5 * SIZE(BUFFER)
372 STFD f0, 6 * SIZE(BUFFER)
373 STFD f0, 7 * SIZE(BUFFER)
374 addi BUFFER, BUFFER, 8 * SIZE
/* Setup for a block of four columns (AO1..AO4). The lines that derive */
/* AO1..AO4, XX and YY from TEMP are not visible in this view. */
/* TEMP = IS << BASE_SHIFT. */
393 slwi TEMP, IS, BASE_SHIFT
/* atemp1..atemp4 = four consecutive elements at XX. */
397 LFD atemp1, 0 * SIZE(XX)
398 LFD atemp2, 1 * SIZE(XX)
399 LFD atemp3, 2 * SIZE(XX)
400 LFD atemp4, 3 * SIZE(XX)
/* Load ten values forming one triangle of a 4x4 block: four from AO1, */
/* three from AO2, two from AO3, one from AO4. */
402 LFD a1, 0 * SIZE(AO1)
403 LFD a2, 1 * SIZE(AO1)
404 LFD a3, 2 * SIZE(AO1)
405 LFD a4, 3 * SIZE(AO1)
407 LFD a6, 1 * SIZE(AO2)
408 LFD a7, 2 * SIZE(AO2)
409 LFD a8, 3 * SIZE(AO2)
411 LFD a11, 2 * SIZE(AO3)
412 LFD a12, 3 * SIZE(AO3)
414 LFD a16, 3 * SIZE(AO4)
/* xsum1..xsum4 = S * (atemp1..atemp4), where S is the symmetric 4x4 */
/* matrix with rows (a1 a2 a3 a4), (a2 a6 a7 a8), (a3 a7 a11 a12), */
/* (a4 a8 a12 a16): each off-diagonal value is used on both sides. */
418 FMUL xsum1, atemp1, a1
419 FMUL xsum2, atemp1, a2
420 FMUL xsum3, atemp1, a3
421 FMUL xsum4, atemp1, a4
423 FMADD xsum1, atemp2, a2, xsum1
424 FMADD xsum2, atemp2, a6, xsum2
425 FMADD xsum3, atemp2, a7, xsum3
426 FMADD xsum4, atemp2, a8, xsum4
428 FMADD xsum1, atemp3, a3, xsum1
429 FMADD xsum2, atemp3, a7, xsum2
430 FMADD xsum3, atemp3, a11, xsum3
431 FMADD xsum4, atemp3, a12, xsum4
433 FMADD xsum1, atemp4, a4, xsum1
434 FMADD xsum2, atemp4, a8, xsum2
435 FMADD xsum3, atemp4, a12, xsum3
436 FMADD xsum4, atemp4, a16, xsum4
/* Scale atemp1..atemp4 by a5. a5 is set on a line not visible here */
/* (presumably loaded from ALPHA -- TODO confirm). */
438 FMUL atemp1, a5, atemp1
439 FMUL atemp2, a5, atemp2
440 FMUL atemp3, a5, atemp3
441 FMUL atemp4, a5, atemp4
/* Preload the operands for the rows below the 4x4 block: elements 4..7 */
/* of XX, YY and of each of the four columns. */
443 LFD xtemp1, 4 * SIZE(XX)
444 LFD xtemp2, 5 * SIZE(XX)
445 LFD xtemp3, 6 * SIZE(XX)
446 LFD xtemp4, 7 * SIZE(XX)
448 LFD y01, 4 * SIZE(YY)
449 LFD y02, 5 * SIZE(YY)
450 LFD y03, 6 * SIZE(YY)
451 LFD y04, 7 * SIZE(YY)
453 LFD a1, 4 * SIZE(AO1)
454 LFD a2, 5 * SIZE(AO1)
455 LFD a3, 6 * SIZE(AO1)
456 LFD a4, 7 * SIZE(AO1)
458 LFD a5, 4 * SIZE(AO2)
459 LFD a6, 5 * SIZE(AO2)
460 LFD a7, 6 * SIZE(AO2)
461 LFD a8, 7 * SIZE(AO2)
463 LFD a9, 4 * SIZE(AO3)
464 LFD a10, 5 * SIZE(AO3)
465 LFD a11, 6 * SIZE(AO3)
466 LFD a12, 7 * SIZE(AO3)
468 LFD a13, 4 * SIZE(AO4)
469 LFD a14, 5 * SIZE(AO4)
470 LFD a15, 6 * SIZE(AO4)
471 LFD a16, 7 * SIZE(AO4)
/* Step all six pointers past the 4x4 block, so the values just loaded */
/* now correspond to offsets 0..3 of the advanced pointers. */
473 addi AO1, AO1, 4 * SIZE
474 addi AO2, AO2, 4 * SIZE
475 addi AO3, AO3, 4 * SIZE
476 addi AO4, AO4, 4 * SIZE
478 addi XX, XX, 4 * SIZE
479 addi YY, YY, 4 * SIZE
/* 16-row unrolled body: four groups of four rows against the four */
/* columns addressed by AO1..AO4. For row r (xtemp_r, y0r) and column c: */
/*   xsum_c += xtemp_r * A(r,c)   and   y0r += atemp_c * A(r,c), */
/* where a1..a4, a5..a8, a9..a12 and a13..a16 hold rows 1..4 of the AO1, */
/* AO2, AO3 and AO4 columns. FMADD d, x, y, z is read as d = x*y + z */
/* (the macro is defined elsewhere). Each operand is reloaded for the */
/* next group right after its last use, so loads overlap the arithmetic. */
/* The loop label, any prefetches and the closing branch are not visible. */
/* Group 1: rows at offsets 0..3; reloads fetch offsets 4..7. */
489 FMADD xsum1, xtemp1, a1, xsum1
491 FMADD y01, atemp1, a1, y01
492 LFD a1, 4 * SIZE(AO1)
494 FMADD xsum2, xtemp1, a5, xsum2
496 FMADD y02, atemp1, a2, y02
499 FMADD xsum3, xtemp1, a9, xsum3
501 FMADD y03, atemp1, a3, y03
504 FMADD xsum4, xtemp1, a13, xsum4
505 LFD xtemp1, 4 * SIZE(XX)
506 FMADD y04, atemp1, a4, y04
509 FMADD xsum1, xtemp2, a2, xsum1
510 LFD a2, 5 * SIZE(AO1)
511 FMADD y01, atemp2, a5, y01
512 LFD a5, 4 * SIZE(AO2)
514 FMADD xsum2, xtemp2, a6, xsum2
516 FMADD y02, atemp2, a6, y02
517 LFD a6, 5 * SIZE(AO2)
519 FMADD xsum3, xtemp2, a10, xsum3
521 FMADD y03, atemp2, a7, y03
524 FMADD xsum4, xtemp2, a14, xsum4
525 LFD xtemp2, 5 * SIZE(XX)
526 FMADD y04, atemp2, a8, y04
530 FMADD xsum1, xtemp3, a3, xsum1
531 LFD a3, 6 * SIZE(AO1)
532 FMADD y01, atemp3, a9, y01
533 LFD a9, 4 * SIZE(AO3)
535 FMADD xsum2, xtemp3, a7, xsum2
536 LFD a7, 6 * SIZE(AO2)
537 FMADD y02, atemp3, a10, y02
538 LFD a10, 5 * SIZE(AO3)
540 FMADD xsum3, xtemp3, a11, xsum3
542 FMADD y03, atemp3, a11, y03
543 LFD a11, 6 * SIZE(AO3)
545 FMADD xsum4, xtemp3, a15, xsum4
546 LFD xtemp3, 6 * SIZE(XX)
547 FMADD y04, atemp3, a12, y04
550 FMADD xsum1, xtemp4, a4, xsum1
551 LFD a4, 7 * SIZE(AO1)
552 FMADD y01, atemp4, a13, y01
553 LFD a13, 4 * SIZE(AO4)
555 FMADD xsum2, xtemp4, a8, xsum2
556 LFD a8, 7 * SIZE(AO2)
557 FMADD y02, atemp4, a14, y02
558 LFD a14, 5 * SIZE(AO4)
560 FMADD xsum3, xtemp4, a12, xsum3
561 LFD a12, 7 * SIZE(AO3)
562 FMADD y03, atemp4, a15, y03
563 LFD a15, 6 * SIZE(AO4)
565 FMADD xsum4, xtemp4, a16, xsum4
566 LFD xtemp4, 7 * SIZE(XX)
567 FMADD y04, atemp4, a16, y04
568 LFD a16, 7 * SIZE(AO4)
/* Write back the four finished y values and fetch the next four. */
570 STFD y01, 0 * SIZE(YY)
571 LFD y01, 4 * SIZE(YY)
572 STFD y02, 1 * SIZE(YY)
573 LFD y02, 5 * SIZE(YY)
575 STFD y03, 2 * SIZE(YY)
576 LFD y03, 6 * SIZE(YY)
577 STFD y04, 3 * SIZE(YY)
578 LFD y04, 7 * SIZE(YY)
/* Group 2: rows at offsets 4..7; reloads fetch offsets 8..11. */
580 FMADD xsum1, xtemp1, a1, xsum1
582 FMADD y01, atemp1, a1, y01
583 LFD a1, 8 * SIZE(AO1)
585 FMADD xsum2, xtemp1, a5, xsum2
587 FMADD y02, atemp1, a2, y02
590 FMADD xsum3, xtemp1, a9, xsum3
592 FMADD y03, atemp1, a3, y03
595 FMADD xsum4, xtemp1, a13, xsum4
596 LFD xtemp1, 8 * SIZE(XX)
597 FMADD y04, atemp1, a4, y04
600 FMADD xsum1, xtemp2, a2, xsum1
601 LFD a2, 9 * SIZE(AO1)
602 FMADD y01, atemp2, a5, y01
603 LFD a5, 8 * SIZE(AO2)
605 FMADD xsum2, xtemp2, a6, xsum2
607 FMADD y02, atemp2, a6, y02
608 LFD a6, 9 * SIZE(AO2)
610 FMADD xsum3, xtemp2, a10, xsum3
612 FMADD y03, atemp2, a7, y03
615 FMADD xsum4, xtemp2, a14, xsum4
616 LFD xtemp2, 9 * SIZE(XX)
617 FMADD y04, atemp2, a8, y04
620 FMADD xsum1, xtemp3, a3, xsum1
621 LFD a3, 10 * SIZE(AO1)
622 FMADD y01, atemp3, a9, y01
623 LFD a9, 8 * SIZE(AO3)
625 FMADD xsum2, xtemp3, a7, xsum2
626 LFD a7, 10 * SIZE(AO2)
627 FMADD y02, atemp3, a10, y02
628 LFD a10, 9 * SIZE(AO3)
630 FMADD xsum3, xtemp3, a11, xsum3
632 FMADD y03, atemp3, a11, y03
633 LFD a11, 10 * SIZE(AO3)
635 FMADD xsum4, xtemp3, a15, xsum4
636 LFD xtemp3, 10 * SIZE(XX)
637 FMADD y04, atemp3, a12, y04
640 FMADD xsum1, xtemp4, a4, xsum1
641 LFD a4, 11 * SIZE(AO1)
642 FMADD y01, atemp4, a13, y01
643 LFD a13, 8 * SIZE(AO4)
645 FMADD xsum2, xtemp4, a8, xsum2
646 LFD a8, 11 * SIZE(AO2)
647 FMADD y02, atemp4, a14, y02
648 LFD a14, 9 * SIZE(AO4)
650 FMADD xsum3, xtemp4, a12, xsum3
651 LFD a12, 11 * SIZE(AO3)
652 FMADD y03, atemp4, a15, y03
653 LFD a15, 10 * SIZE(AO4)
655 FMADD xsum4, xtemp4, a16, xsum4
656 LFD xtemp4, 11 * SIZE(XX)
657 FMADD y04, atemp4, a16, y04
658 LFD a16, 11 * SIZE(AO4)
660 STFD y01, 4 * SIZE(YY)
661 LFD y01, 8 * SIZE(YY)
662 STFD y02, 5 * SIZE(YY)
663 LFD y02, 9 * SIZE(YY)
665 STFD y03, 6 * SIZE(YY)
666 LFD y03, 10 * SIZE(YY)
667 STFD y04, 7 * SIZE(YY)
668 LFD y04, 11 * SIZE(YY)
/* Group 3: rows at offsets 8..11; reloads fetch offsets 12..15. */
671 FMADD xsum1, xtemp1, a1, xsum1
673 FMADD y01, atemp1, a1, y01
674 LFD a1, 12 * SIZE(AO1)
676 FMADD xsum2, xtemp1, a5, xsum2
678 FMADD y02, atemp1, a2, y02
681 FMADD xsum3, xtemp1, a9, xsum3
683 FMADD y03, atemp1, a3, y03
686 FMADD xsum4, xtemp1, a13, xsum4
687 LFD xtemp1, 12 * SIZE(XX)
688 FMADD y04, atemp1, a4, y04
691 FMADD xsum1, xtemp2, a2, xsum1
692 LFD a2, 13 * SIZE(AO1)
693 FMADD y01, atemp2, a5, y01
694 LFD a5, 12 * SIZE(AO2)
696 FMADD xsum2, xtemp2, a6, xsum2
698 FMADD y02, atemp2, a6, y02
699 LFD a6, 13 * SIZE(AO2)
701 FMADD xsum3, xtemp2, a10, xsum3
703 FMADD y03, atemp2, a7, y03
707 FMADD xsum4, xtemp2, a14, xsum4
708 LFD xtemp2, 13 * SIZE(XX)
709 FMADD y04, atemp2, a8, y04
712 FMADD xsum1, xtemp3, a3, xsum1
713 LFD a3, 14 * SIZE(AO1)
714 FMADD y01, atemp3, a9, y01
715 LFD a9, 12 * SIZE(AO3)
717 FMADD xsum2, xtemp3, a7, xsum2
718 LFD a7, 14 * SIZE(AO2)
719 FMADD y02, atemp3, a10, y02
720 LFD a10,13 * SIZE(AO3)
722 FMADD xsum3, xtemp3, a11, xsum3
724 FMADD y03, atemp3, a11, y03
725 LFD a11, 14 * SIZE(AO3)
727 FMADD xsum4, xtemp3, a15, xsum4
728 LFD xtemp3, 14 * SIZE(XX)
729 FMADD y04, atemp3, a12, y04
732 FMADD xsum1, xtemp4, a4, xsum1
733 LFD a4, 15 * SIZE(AO1)
734 FMADD y01, atemp4, a13, y01
735 LFD a13,12 * SIZE(AO4)
737 FMADD xsum2, xtemp4, a8, xsum2
738 LFD a8, 15 * SIZE(AO2)
739 FMADD y02, atemp4, a14, y02
740 LFD a14, 13 * SIZE(AO4)
742 FMADD xsum3, xtemp4, a12, xsum3
743 LFD a12, 15 * SIZE(AO3)
744 FMADD y03, atemp4, a15, y03
745 LFD a15, 14 * SIZE(AO4)
747 FMADD xsum4, xtemp4, a16, xsum4
748 LFD xtemp4, 15 * SIZE(XX)
749 FMADD y04, atemp4, a16, y04
750 LFD a16, 15 * SIZE(AO4)
752 STFD y01, 8 * SIZE(YY)
753 LFD y01, 12 * SIZE(YY)
754 STFD y02, 9 * SIZE(YY)
755 LFD y02, 13 * SIZE(YY)
757 STFD y03, 10 * SIZE(YY)
758 LFD y03, 14 * SIZE(YY)
759 STFD y04, 11 * SIZE(YY)
760 LFD y04, 15 * SIZE(YY)
/* Group 4: rows at offsets 12..15. The 16 * SIZE pointer advances are */
/* interleaved with the arithmetic, so reload offsets switch from 16..18 */
/* (pointer not yet advanced) to 0..3 (pointer already advanced); both */
/* forms address the first four rows of the next 16-row pass. */
762 FMADD xsum1, xtemp1, a1, xsum1
764 FMADD y01, atemp1, a1, y01
765 LFD a1, 16 * SIZE(AO1)
767 FMADD xsum2, xtemp1, a5, xsum2
769 FMADD y02, atemp1, a2, y02
772 FMADD xsum3, xtemp1, a9, xsum3
774 FMADD y03, atemp1, a3, y03
777 FMADD xsum4, xtemp1, a13, xsum4
778 LFD xtemp1, 16 * SIZE(XX)
779 FMADD y04, atemp1, a4, y04
780 addi YY, YY, 16 * SIZE
782 FMADD xsum1, xtemp2, a2, xsum1
783 LFD a2, 17 * SIZE(AO1)
784 FMADD y01, atemp2, a5, y01
785 LFD a5, 16 * SIZE(AO2)
787 FMADD xsum2, xtemp2, a6, xsum2
788 addi AO3, AO3, 16 * SIZE
789 FMADD y02, atemp2, a6, y02
790 LFD a6, 17 * SIZE(AO2)
792 FMADD xsum3, xtemp2, a10, xsum3
793 addi AO1, AO1, 16 * SIZE
794 FMADD y03, atemp2, a7, y03
795 addi AO2, AO2, 16 * SIZE
797 FMADD xsum4, xtemp2, a14, xsum4
798 LFD xtemp2, 17 * SIZE(XX)
799 FMADD y04, atemp2, a8, y04
800 addi AO4, AO4, 16 * SIZE
802 FMADD xsum1, xtemp3, a3, xsum1
803 LFD a3, 2 * SIZE(AO1)
804 FMADD y01, atemp3, a9, y01
805 LFD a9, 0 * SIZE(AO3)
807 FMADD xsum2, xtemp3, a7, xsum2
808 LFD a7, 2 * SIZE(AO2)
809 FMADD y02, atemp3, a10, y02
810 LFD a10, 1 * SIZE(AO3)
812 FMADD xsum3, xtemp3, a11, xsum3
814 FMADD y03, atemp3, a11, y03
815 LFD a11, 2 * SIZE(AO3)
817 FMADD xsum4, xtemp3, a15, xsum4
818 LFD xtemp3, 18 * SIZE(XX)
819 FMADD y04, atemp3, a12, y04
820 addi XX, XX, 16 * SIZE
822 FMADD xsum1, xtemp4, a4, xsum1
823 LFD a4, 3 * SIZE(AO1)
824 FMADD y01, atemp4, a13, y01
825 LFD a13, 0 * SIZE(AO4)
827 FMADD xsum2, xtemp4, a8, xsum2
828 LFD a8, 3 * SIZE(AO2)
829 FMADD y02, atemp4, a14, y02
830 LFD a14, 1 * SIZE(AO4)
832 FMADD xsum3, xtemp4, a12, xsum3
833 LFD a12, 3 * SIZE(AO3)
834 FMADD y03, atemp4, a15, y03
835 LFD a15, 2 * SIZE(AO4)
837 FMADD xsum4, xtemp4, a16, xsum4
838 LFD xtemp4, 3 * SIZE(XX)
839 FMADD y04, atemp4, a16, y04
840 LFD a16, 3 * SIZE(AO4)
/* YY was already advanced by 16 * SIZE, so offsets -4..-1 are the rows */
/* just finished and offsets 0..3 are the first rows of the next pass. */
842 STFD y01, -4 * SIZE(YY)
843 LFD y01, 0 * SIZE(YY)
844 STFD y02, -3 * SIZE(YY)
845 LFD y02, 1 * SIZE(YY)
847 STFD y03, -2 * SIZE(YY)
848 LFD y03, 2 * SIZE(YY)
849 STFD y04, -1 * SIZE(YY)
850 LFD y04, 3 * SIZE(YY)
/* 8-row body: two groups of four rows against the columns at AO1..AO4. */
/* Per row r and column c: xsum_c += xtemp_r * A(r,c) and */
/* y0r += atemp_c * A(r,c); operands are reloaded right after last use. */
/* The test and branch that select this path are not visible here. */
/* Group 1: rows at offsets 0..3; reloads fetch offsets 4..7. */
860 FMADD xsum1, xtemp1, a1, xsum1
862 FMADD y01, atemp1, a1, y01
863 LFD a1, 4 * SIZE(AO1)
865 FMADD xsum2, xtemp1, a5, xsum2
867 FMADD y02, atemp1, a2, y02
870 FMADD xsum3, xtemp1, a9, xsum3
872 FMADD y03, atemp1, a3, y03
875 FMADD xsum4, xtemp1, a13, xsum4
876 LFD xtemp1, 4 * SIZE(XX)
877 FMADD y04, atemp1, a4, y04
880 FMADD xsum1, xtemp2, a2, xsum1
881 LFD a2, 5 * SIZE(AO1)
882 FMADD y01, atemp2, a5, y01
883 LFD a5, 4 * SIZE(AO2)
885 FMADD xsum2, xtemp2, a6, xsum2
887 FMADD y02, atemp2, a6, y02
888 LFD a6, 5 * SIZE(AO2)
890 FMADD xsum3, xtemp2, a10, xsum3
892 FMADD y03, atemp2, a7, y03
895 FMADD xsum4, xtemp2, a14, xsum4
896 LFD xtemp2, 5 * SIZE(XX)
897 FMADD y04, atemp2, a8, y04
900 FMADD xsum1, xtemp3, a3, xsum1
901 LFD a3, 6 * SIZE(AO1)
902 FMADD y01, atemp3, a9, y01
903 LFD a9, 4 * SIZE(AO3)
905 FMADD xsum2, xtemp3, a7, xsum2
906 LFD a7, 6 * SIZE(AO2)
907 FMADD y02, atemp3, a10, y02
908 LFD a10, 5 * SIZE(AO3)
910 FMADD xsum3, xtemp3, a11, xsum3
912 FMADD y03, atemp3, a11, y03
913 LFD a11, 6 * SIZE(AO3)
915 FMADD xsum4, xtemp3, a15, xsum4
916 LFD xtemp3, 6 * SIZE(XX)
917 FMADD y04, atemp3, a12, y04
920 FMADD xsum1, xtemp4, a4, xsum1
921 LFD a4, 7 * SIZE(AO1)
922 FMADD y01, atemp4, a13, y01
923 LFD a13, 4 * SIZE(AO4)
925 FMADD xsum2, xtemp4, a8, xsum2
926 LFD a8, 7 * SIZE(AO2)
927 FMADD y02, atemp4, a14, y02
928 LFD a14, 5 * SIZE(AO4)
930 FMADD xsum3, xtemp4, a12, xsum3
931 LFD a12, 7 * SIZE(AO3)
932 FMADD y03, atemp4, a15, y03
933 LFD a15, 6 * SIZE(AO4)
935 FMADD xsum4, xtemp4, a16, xsum4
936 LFD xtemp4, 7 * SIZE(XX)
937 FMADD y04, atemp4, a16, y04
938 LFD a16, 7 * SIZE(AO4)
940 STFD y01, 0 * SIZE(YY)
941 LFD y01, 4 * SIZE(YY)
942 STFD y02, 1 * SIZE(YY)
943 LFD y02, 5 * SIZE(YY)
945 STFD y03, 2 * SIZE(YY)
946 LFD y03, 6 * SIZE(YY)
947 STFD y04, 3 * SIZE(YY)
948 LFD y04, 7 * SIZE(YY)
/* Group 2: rows at offsets 4..7; reloads fetch offsets 8..11. */
950 FMADD xsum1, xtemp1, a1, xsum1
952 FMADD y01, atemp1, a1, y01
953 LFD a1, 8 * SIZE(AO1)
955 FMADD xsum2, xtemp1, a5, xsum2
957 FMADD y02, atemp1, a2, y02
960 FMADD xsum3, xtemp1, a9, xsum3
962 FMADD y03, atemp1, a3, y03
965 FMADD xsum4, xtemp1, a13, xsum4
966 LFD xtemp1, 8 * SIZE(XX)
967 FMADD y04, atemp1, a4, y04
970 FMADD xsum1, xtemp2, a2, xsum1
971 LFD a2, 9 * SIZE(AO1)
972 FMADD y01, atemp2, a5, y01
973 LFD a5, 8 * SIZE(AO2)
975 FMADD xsum2, xtemp2, a6, xsum2
977 FMADD y02, atemp2, a6, y02
978 LFD a6, 9 * SIZE(AO2)
980 FMADD xsum3, xtemp2, a10, xsum3
982 FMADD y03, atemp2, a7, y03
985 FMADD xsum4, xtemp2, a14, xsum4
986 LFD xtemp2, 9 * SIZE(XX)
987 FMADD y04, atemp2, a8, y04
990 FMADD xsum1, xtemp3, a3, xsum1
991 LFD a3, 10 * SIZE(AO1)
992 FMADD y01, atemp3, a9, y01
993 LFD a9, 8 * SIZE(AO3)
995 FMADD xsum2, xtemp3, a7, xsum2
996 LFD a7, 10 * SIZE(AO2)
997 FMADD y02, atemp3, a10, y02
998 LFD a10, 9 * SIZE(AO3)
1000 FMADD xsum3, xtemp3, a11, xsum3
1002 FMADD y03, atemp3, a11, y03
1003 LFD a11, 10 * SIZE(AO3)
1005 FMADD xsum4, xtemp3, a15, xsum4
1006 LFD xtemp3, 10 * SIZE(XX)
1007 FMADD y04, atemp3, a12, y04
1010 FMADD xsum1, xtemp4, a4, xsum1
1011 LFD a4, 11 * SIZE(AO1)
1012 FMADD y01, atemp4, a13, y01
1013 LFD a13, 8 * SIZE(AO4)
1015 FMADD xsum2, xtemp4, a8, xsum2
1016 LFD a8, 11 * SIZE(AO2)
1017 FMADD y02, atemp4, a14, y02
1018 LFD a14, 9 * SIZE(AO4)
1020 FMADD xsum3, xtemp4, a12, xsum3
1021 LFD a12, 11 * SIZE(AO3)
1022 FMADD y03, atemp4, a15, y03
1023 LFD a15, 10 * SIZE(AO4)
1025 FMADD xsum4, xtemp4, a16, xsum4
1026 LFD xtemp4, 11 * SIZE(XX)
1027 FMADD y04, atemp4, a16, y04
1028 LFD a16, 11 * SIZE(AO4)
/* Advance the column pointers past the eight rows just processed. */
1030 addi AO1, AO1, 8 * SIZE
1031 addi AO2, AO2, 8 * SIZE
1032 addi AO3, AO3, 8 * SIZE
1033 addi AO4, AO4, 8 * SIZE
/* Store the second group's y values, fetch the next four, then advance */
/* XX and YY by the same eight elements. */
1035 STFD y01, 4 * SIZE(YY)
1036 LFD y01, 8 * SIZE(YY)
1037 STFD y02, 5 * SIZE(YY)
1038 LFD y02, 9 * SIZE(YY)
1040 STFD y03, 6 * SIZE(YY)
1041 LFD y03, 10 * SIZE(YY)
1042 STFD y04, 7 * SIZE(YY)
1043 LFD y04, 11 * SIZE(YY)
1045 addi XX, XX, 8 * SIZE
1046 addi YY, YY, 8 * SIZE
/* 4-row body: one group of four rows against the columns at AO1..AO4. */
/* Per row r and column c: xsum_c += xtemp_r * A(r,c) and */
/* y0r += atemp_c * A(r,c); each operand is reloaded from offsets 4..7 */
/* right after its last use. The selecting branch is not visible here. */
1055 FMADD xsum1, xtemp1, a1, xsum1
1057 FMADD y01, atemp1, a1, y01
1058 LFD a1, 4 * SIZE(AO1)
1060 FMADD xsum2, xtemp1, a5, xsum2
1062 FMADD y02, atemp1, a2, y02
1065 FMADD xsum3, xtemp1, a9, xsum3
1067 FMADD y03, atemp1, a3, y03
1070 FMADD xsum4, xtemp1, a13, xsum4
1071 LFD xtemp1, 4 * SIZE(XX)
1072 FMADD y04, atemp1, a4, y04
1075 FMADD xsum1, xtemp2, a2, xsum1
1076 LFD a2, 5 * SIZE(AO1)
1077 FMADD y01, atemp2, a5, y01
1078 LFD a5, 4 * SIZE(AO2)
1080 FMADD xsum2, xtemp2, a6, xsum2
1082 FMADD y02, atemp2, a6, y02
1083 LFD a6, 5 * SIZE(AO2)
1085 FMADD xsum3, xtemp2, a10, xsum3
1087 FMADD y03, atemp2, a7, y03
1090 FMADD xsum4, xtemp2, a14, xsum4
1091 LFD xtemp2, 5 * SIZE(XX)
1092 FMADD y04, atemp2, a8, y04
1095 FMADD xsum1, xtemp3, a3, xsum1
1096 LFD a3, 6 * SIZE(AO1)
1097 FMADD y01, atemp3, a9, y01
1098 LFD a9, 4 * SIZE(AO3)
1100 FMADD xsum2, xtemp3, a7, xsum2
1101 LFD a7, 6 * SIZE(AO2)
1102 FMADD y02, atemp3, a10, y02
1103 LFD a10, 5 * SIZE(AO3)
1105 FMADD xsum3, xtemp3, a11, xsum3
1107 FMADD y03, atemp3, a11, y03
1108 LFD a11, 6 * SIZE(AO3)
1110 FMADD xsum4, xtemp3, a15, xsum4
1111 LFD xtemp3, 6 * SIZE(XX)
1112 FMADD y04, atemp3, a12, y04
1115 FMADD xsum1, xtemp4, a4, xsum1
1116 LFD a4, 7 * SIZE(AO1)
1117 FMADD y01, atemp4, a13, y01
1118 LFD a13, 4 * SIZE(AO4)
1120 FMADD xsum2, xtemp4, a8, xsum2
1121 LFD a8, 7 * SIZE(AO2)
1122 FMADD y02, atemp4, a14, y02
1123 LFD a14, 5 * SIZE(AO4)
1125 FMADD xsum3, xtemp4, a12, xsum3
1126 LFD a12, 7 * SIZE(AO3)
1127 FMADD y03, atemp4, a15, y03
1128 LFD a15, 6 * SIZE(AO4)
1130 FMADD xsum4, xtemp4, a16, xsum4
1131 LFD xtemp4, 7 * SIZE(XX)
1132 FMADD y04, atemp4, a16, y04
1133 LFD a16, 7 * SIZE(AO4)
/* Advance the column pointers past the four rows just processed. */
1135 addi AO1, AO1, 4 * SIZE
1136 addi AO2, AO2, 4 * SIZE
1137 addi AO3, AO3, 4 * SIZE
1138 addi AO4, AO4, 4 * SIZE
/* Store the four y results, fetch the next four, then advance XX and YY. */
1140 STFD y01, 0 * SIZE(YY)
1141 LFD y01, 4 * SIZE(YY)
1142 STFD y02, 1 * SIZE(YY)
1143 LFD y02, 5 * SIZE(YY)
1145 STFD y03, 2 * SIZE(YY)
1146 LFD y03, 6 * SIZE(YY)
1147 STFD y04, 3 * SIZE(YY)
1148 LFD y04, 7 * SIZE(YY)
1150 addi XX, XX, 4 * SIZE
1151 addi YY, YY, 4 * SIZE
/* 2-row body against the four columns. Row 1 uses xtemp1 with a1, a5, */
/* a9, a13 and accumulates into y01; row 2 uses xtemp2 with a2, a6, a10, */
/* a14 and accumulates into y02. Each xsum_c gains both rows' terms. */
/* The selecting branch is not visible here. */
1158 FMADD xsum1, xtemp1, a1, xsum1
1159 FMADD y01, atemp1, a1, y01
/* Row-1 operands are reloaded from offset 2 right after their last use. */
1160 LFD a1, 2 * SIZE(AO1)
1162 FMADD xsum2, xtemp1, a5, xsum2
1163 FMADD y02, atemp1, a2, y02
1165 FMADD xsum3, xtemp1, a9, xsum3
1166 FMADD y01, atemp2, a5, y01
1167 LFD a5, 2 * SIZE(AO2)
1169 FMADD xsum4, xtemp1, a13, xsum4
1170 LFD xtemp1, 2 * SIZE(XX)
1171 FMADD y02, atemp2, a6, y02
1173 FMADD xsum1, xtemp2, a2, xsum1
1174 FMADD y01, atemp3, a9, y01
1175 LFD a9, 2 * SIZE(AO3)
1177 FMADD xsum2, xtemp2, a6, xsum2
1178 FMADD y02, atemp3, a10, y02
1180 FMADD xsum3, xtemp2, a10, xsum3
1181 FMADD y01, atemp4, a13, y01
1182 LFD a13, 2 * SIZE(AO4)
1184 FMADD xsum4, xtemp2, a14, xsum4
1185 FMADD y02, atemp4, a14, y02
/* Store both y results, fetch the y value at offset 2, then advance YY */
/* by two elements. AO1..AO4 and XX are not advanced in the visible lines. */
1187 STFD y01, 0 * SIZE(YY)
1188 LFD y01, 2 * SIZE(YY)
1189 STFD y02, 1 * SIZE(YY)
1190 addi YY, YY, 2 * SIZE
/* 1-row body against the four columns: each xsum_c gains xtemp1 times */
/* the row's entry in column c (a1, a5, a9, a13), and y01 gains */
/* atemp_c times that same entry. The selecting branch is not visible. */
1197 FMADD xsum1, xtemp1, a1, xsum1
1198 FMADD y01, atemp1, a1, y01
1199 FMADD xsum2, xtemp1, a5, xsum2
1200 FMADD y01, atemp2, a5, y01
1201 FMADD xsum3, xtemp1, a9, xsum3
1202 FMADD y01, atemp3, a9, y01
1203 FMADD xsum4, xtemp1, a13, xsum4
1204 FMADD y01, atemp4, a13, y01
/* Write the finished y element back. */
1206 STFD y01, 0 * SIZE(YY)
/* Fold the four column sums into y. TEMP = IS << BASE_SHIFT; the lines */
/* that use TEMP to re-point YY are not visible here. */
1210 slwi TEMP, IS, BASE_SHIFT
1213 LFD y01, 0 * SIZE(YY)
1214 LFD y02, 1 * SIZE(YY)
1215 LFD y03, 2 * SIZE(YY)
1216 LFD y04, 3 * SIZE(YY)
/* Scale xsum1..xsum4 by xtemp1. xtemp1 is set on a line not visible */
/* here (presumably reloaded from ALPHA -- TODO confirm). */
1220 FMUL xsum1, xtemp1, xsum1
1221 FMUL xsum2, xtemp1, xsum2
1222 FMUL xsum3, xtemp1, xsum3
1223 FMUL xsum4, xtemp1, xsum4
/* y0i += scaled xsum_i, then store the four results. */
1225 FADD y01, y01, xsum1
1226 FADD y02, y02, xsum2
1227 FADD y03, y03, xsum3
1228 FADD y04, y04, xsum4
1230 STFD y01, 0 * SIZE(YY)
1231 STFD y02, 1 * SIZE(YY)
1232 STFD y03, 2 * SIZE(YY)
1233 STFD y04, 3 * SIZE(YY)
/* Two-column block (AO1, AO2). TEMP = IS << BASE_SHIFT; the lines that */
/* derive AO1, AO2, XX and YY from it are not visible here. */
1250 slwi TEMP, IS, BASE_SHIFT
1254 LFD atemp1, 0 * SIZE(XX)
1255 LFD atemp2, 1 * SIZE(XX)
/* Three values forming one triangle of a 2x2 block. */
1257 LFD a1, 0 * SIZE(AO1)
1258 LFD a2, 1 * SIZE(AO1)
1259 LFD a6, 1 * SIZE(AO2)
/* (xsum1, xsum2) = S * (atemp1, atemp2) for the symmetric 2x2 matrix S */
/* with rows (a1 a2) and (a2 a6). */
1263 FMUL xsum1, atemp1, a1
1264 FMUL xsum2, atemp1, a2
1266 FMADD xsum1, atemp2, a2, xsum1
1267 FMADD xsum2, atemp2, a6, xsum2
/* Scale atemp1/atemp2 by a5, which is set on a line not visible here */
/* (presumably loaded from ALPHA -- TODO confirm). */
1269 FMUL atemp1, a5, atemp1
1270 FMUL atemp2, a5, atemp2
/* Load the operands for the row at offset 2, just below the 2x2 block. */
1272 LFD xtemp1, 2 * SIZE(XX)
1273 LFD y01, 2 * SIZE(YY)
1274 LFD a1, 2 * SIZE(AO1)
1275 LFD a5, 2 * SIZE(AO2)
/* One-row update: each xsum gains xtemp1 times the column entry, y01 */
/* gains atemp times the same entry. The guarding branch is not visible. */
1280 FMADD xsum1, xtemp1, a1, xsum1
1281 FMADD y01, atemp1, a1, y01
1282 FMADD xsum2, xtemp1, a5, xsum2
1283 FMADD y01, atemp2, a5, y01
1285 STFD y01, 2 * SIZE(YY)
/* Fold the two column sums into y. TEMP = IS << BASE_SHIFT (its use is */
/* not visible here). xtemp1 is set on a line not shown (presumably */
/* reloaded from ALPHA -- TODO confirm). */
1289 slwi TEMP, IS, BASE_SHIFT
1292 LFD y01, 0 * SIZE(YY)
1293 LFD y02, 1 * SIZE(YY)
/* xsum_i *= xtemp1; y0i += xsum_i; store both results. */
1297 FMUL xsum1, xtemp1, xsum1
1298 FMUL xsum2, xtemp1, xsum2
1300 FADD y01, y01, xsum1
1301 FADD y02, y02, xsum2
1303 STFD y01, 0 * SIZE(YY)
1304 STFD y02, 1 * SIZE(YY)
/* Single-column block: y01 += xtemp1 * (atemp1 * a1), where atemp1, a1 */
/* and y01 are the first elements at XX, AO1 and YY. */
/* TEMP = IS << BASE_SHIFT; the lines that use it are not visible here. */
1315 slwi TEMP, IS, BASE_SHIFT
1319 LFD atemp1, 0 * SIZE(XX)
1320 LFD a1, 0 * SIZE(AO1)
1322 LFD y01, 0 * SIZE(YY)
1324 FMUL xsum1, atemp1, a1
/* xtemp1 is set on a line not visible here (presumably ALPHA -- TODO */
/* confirm). */
1325 FMUL xsum1, xtemp1, xsum1
1327 FADD y01, y01, xsum1
1329 STFD y01, 0 * SIZE(YY)
/* Copy results from NEW_Y to YY. Compare the scaled INCY with SIZE */
/* first; the branch that consumes cr0 is not visible (presumably the */
/* copy is skipped when Y is contiguous -- TODO confirm). */
1333 cmpwi cr0, INCY, SIZE
/* Eight elements at a time: read f8..f15 from consecutive slots at */
/* NEW_Y and advance NEW_Y by 8 * SIZE. */
1361 LFD f8, 0 * SIZE(NEW_Y)
1362 LFD f9, 1 * SIZE(NEW_Y)
1363 LFD f10, 2 * SIZE(NEW_Y)
1364 LFD f11, 3 * SIZE(NEW_Y)
1365 LFD f12, 4 * SIZE(NEW_Y)
1366 LFD f13, 5 * SIZE(NEW_Y)
1367 LFD f14, 6 * SIZE(NEW_Y)
1368 LFD f15, 7 * SIZE(NEW_Y)
1369 addi NEW_Y, NEW_Y, 8 * SIZE
/* Every store targets offset 0 of YY; the lines between the stores are */
/* not visible (presumably YY is stepped by INCY each time -- TODO */
/* confirm). */
1380 STFD f8, 0 * SIZE(YY)
1382 STFD f9, 0 * SIZE(YY)
1384 STFD f10, 0 * SIZE(YY)
1386 STFD f11, 0 * SIZE(YY)
1388 STFD f12, 0 * SIZE(YY)
1390 STFD f13, 0 * SIZE(YY)
1392 STFD f14, 0 * SIZE(YY)
1394 STFD f15, 0 * SIZE(YY)
/* Four-element variant of the same copy. */
1412 LFD f8, 0 * SIZE(NEW_Y)
1413 LFD f9, 1 * SIZE(NEW_Y)
1414 LFD f10, 2 * SIZE(NEW_Y)
1415 LFD f11, 3 * SIZE(NEW_Y)
1416 addi NEW_Y, NEW_Y, 4 * SIZE
1423 STFD f8, 0 * SIZE(YY)
1425 STFD f9, 0 * SIZE(YY)
1427 STFD f10, 0 * SIZE(YY)
1429 STFD f11, 0 * SIZE(YY)
/* Two-element variant. */
1442 LFD f8, 0 * SIZE(NEW_Y)
1443 LFD f9, 1 * SIZE(NEW_Y)
1444 addi NEW_Y, NEW_Y, 2 * SIZE
1449 STFD f8, 0 * SIZE(YY)
1451 STFD f9, 0 * SIZE(YY)
/* Single-element variant. */
1460 LFD f8, 0 * SIZE(NEW_Y)
1464 STFD f8, 0 * SIZE(YY)
/* Release the frame: raise SP by the STACKSIZE bytes subtracted at entry. */
/* Register restores and the return instruction are not visible here. */
1521 addi SP, SP, STACKSIZE