1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
/* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration. Several guarded regions below appear without */
/* their matching #else / #endif lines in this view, so which definition is */
/* active for a given target cannot be determined from here. */
66 #if defined(_AIX) || defined(__APPLE__)
67 #if !defined(__64BIT__) && defined(DOUBLE)
/* PREFETCHSIZE_A: prefetch distance in elements; it is multiplied by SIZE */
/* where PREA is initialised. Several candidate values are listed; only the */
/* PPC440 / PPC440FP2 guard is visible here. */
142 #define PREFETCHSIZE_A 24
145 #if defined(PPC440) || defined(PPC440FP2)
146 #define PREFETCHSIZE_A 24
150 #define PREFETCHSIZE_A 32
154 #define PREFETCHSIZE_A 72
158 #define PREFETCHSIZE_A 16
162 #define PREFETCHSIZE_A 96
166 #define PREFETCHSIZE_A 112
/* NOP1 / NOP2 expand to register self-moves (mr r, r), which leave LDA and */
/* INCX unchanged. NOTE(review): presumably scheduling filler; how the */
/* POWER4/POWER5/POWER6/PPC970 guard selects them is not visible here. */
169 #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
173 #define NOP1 mr LDA, LDA
174 #define NOP2 mr INCX, INCX
/* Two alternative stack-frame layouts (selecting condition not visible). */
/* In both, ALPHA_R, ALPHA_I and FZERO are consecutive 8-byte slots in the */
/* last 24 bytes of the frame, addressed relative to SP. */
180 #define STACKSIZE 224
181 #define ALPHA_R 200(SP)
182 #define ALPHA_I 208(SP)
183 #define FZERO 216(SP)
185 #define STACKSIZE 280
186 #define ALPHA_R 256(SP)
187 #define ALPHA_I 264(SP)
188 #define FZERO 272(SP)
/* FMADD1 / FMADD2 pick the fused multiply-add flavour used for the xsum */
/* terms that multiply by a2 / a4 / a6 / a8. Both visible definitions are */
/* FNMSUB; NOTE(review): the alternate definitions are presumably on lines */
/* not shown here. */
192 #define FMADD1 FNMSUB
196 #define FMADD2 FNMSUB
/* Prologue: allocate the local frame, fetch stack-passed arguments, spill */
/* alpha, and pre-scale the strides. */
/* Allocate STACKSIZE bytes by moving SP down. */
202 addi SP, SP, -STACKSIZE
/* Stack-passed arguments are read at FRAMESLOT(n) + STACKSIZE(SP); the */
/* STACKSIZE term steps back over the frame allocated above. Several */
/* alternative load sequences follow: lwz reads a 32-bit word, ld a 64-bit */
/* doubleword. NOTE(review): presumably one sequence per ABI / word size; */
/* most of the guards selecting between them are not visible here. */
261 lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
263 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
264 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
265 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
269 #if defined(_AIX) || defined(__APPLE__)
272 lwz X, FRAMESLOT(0) + STACKSIZE(SP)
273 lwz INCX, FRAMESLOT(1) + STACKSIZE(SP)
274 lwz Y, FRAMESLOT(2) + STACKSIZE(SP)
275 lwz INCY, FRAMESLOT(3) + STACKSIZE(SP)
276 lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP)
278 lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
279 lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
280 lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
283 ld Y, FRAMESLOT(0) + STACKSIZE(SP)
284 ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
285 ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
/* Save both parts of alpha in the frame slots defined above. */
289 STFD alpha_r, ALPHA_R
290 STFD alpha_i, ALPHA_I
/* Scale LDA, INCX and INCY by 2^ZBASE_SHIFT. NOTE(review): presumably this */
/* converts element counts to byte strides - confirm ZBASE_SHIFT. slwi is a */
/* 32-bit shift, so on 64-bit builds the scaled value must fit in 32 bits; */
/* verify this is acceptable for large LDA. */
292 slwi LDA, LDA, ZBASE_SHIFT
293 slwi INCX, INCX, ZBASE_SHIFT
294 slwi INCY, INCY, ZBASE_SHIFT
/* PREA = prefetch distance in bytes (PREFETCHSIZE_A elements * SIZE). */
296 li PREA, PREFETCHSIZE_A * SIZE
/* Compare the scaled INCX with 2 * SIZE; the branch that consumes cr0 is */
/* not visible here. NOTE(review): presumably this detects a contiguous x */
/* so the copy into BUFFER can be skipped. */
305 cmpwi cr0, INCX, 2 * SIZE
/* Store halves of the copy into BUFFER. The loads that fill a1..a8 and the */
/* loop branches are not visible in this view. */
/* Wide step: write eight consecutive values, then advance BUFFER by */
/* 8 * SIZE bytes. */
333 STFD a1, 0 * SIZE(BUFFER)
334 STFD a2, 1 * SIZE(BUFFER)
335 STFD a3, 2 * SIZE(BUFFER)
336 STFD a4, 3 * SIZE(BUFFER)
337 STFD a5, 4 * SIZE(BUFFER)
338 STFD a6, 5 * SIZE(BUFFER)
339 STFD a7, 6 * SIZE(BUFFER)
340 STFD a8, 7 * SIZE(BUFFER)
342 addi BUFFER, BUFFER, 8 * SIZE
/* Narrow step: write two values (presumably one complex element) and */
/* advance BUFFER by 2 * SIZE bytes. */
357 STFD a1, 0 * SIZE(BUFFER)
358 STFD a2, 1 * SIZE(BUFFER)
360 addi BUFFER, BUFFER, 2 * SIZE
/* Compare the scaled INCY with 2 * SIZE; the branch on cr0 is not visible. */
/* NOTE(review): presumably a non-contiguous y is accumulated in BUFFER and */
/* copied back at the end - confirm against the elided branch. */
368 cmpwi cr0, INCY, 2 * SIZE
/* Fill eight consecutive BUFFER slots with f0, then advance by 8 * SIZE. */
/* NOTE(review): f0 presumably holds 0.0 loaded from the FZERO slot; the */
/* load is not visible here. */
379 STFD f0, 0 * SIZE(BUFFER)
380 STFD f0, 1 * SIZE(BUFFER)
381 STFD f0, 2 * SIZE(BUFFER)
382 STFD f0, 3 * SIZE(BUFFER)
383 STFD f0, 4 * SIZE(BUFFER)
384 STFD f0, 5 * SIZE(BUFFER)
385 STFD f0, 6 * SIZE(BUFFER)
386 STFD f0, 7 * SIZE(BUFFER)
387 addi BUFFER, BUFFER, 8 * SIZE
/* Set-up for one pair of columns. */
/* TEMP = IS scaled by 2^ZBASE_SHIFT. NOTE(review): presumably a base */
/* pointer is added to TEMP on a line not shown, since TEMP is used as an */
/* address immediately below. */
402 slwi TEMP, IS, ZBASE_SHIFT
/* Load four consecutive values (presumably two complex x entries). */
408 LFD xtemp1, 0 * SIZE(TEMP)
409 LFD xtemp2, 1 * SIZE(TEMP)
410 LFD xtemp3, 2 * SIZE(TEMP)
411 LFD xtemp4, 3 * SIZE(TEMP)
/* Complex products (y05 + i*y06) * (xtemp1 + i*xtemp2) -> atemp1/atemp2 */
/* and (y05 + i*y06) * (xtemp3 + i*xtemp4) -> atemp3/atemp4, assuming */
/* FNMSUB d, a, c, b yields b - a*c as the PowerPC fnmsub instruction does. */
/* NOTE(review): y05 / y06 presumably hold alpha_r / alpha_i here; the */
/* loads that set them are not visible. */
413 FMUL atemp1, y05, xtemp1
414 FMUL atemp2, y06, xtemp1
415 FMUL atemp3, y05, xtemp3
416 FMUL atemp4, y06, xtemp3
418 FNMSUB atemp1, y06, xtemp2, atemp1
419 FMADD atemp2, y05, xtemp2, atemp2
420 FNMSUB atemp3, y06, xtemp4, atemp3
421 FMADD atemp4, y05, xtemp4, atemp4
/* Preload the first operands for the unrolled code that follows: four */
/* values from each column pointer (AO1 -> a1..a4, AO2 -> a5..a8), four */
/* from XX and four from YY. */
431 LFD a1, 0 * SIZE(AO1)
432 LFD a2, 1 * SIZE(AO1)
433 LFD a3, 2 * SIZE(AO1)
434 LFD a4, 3 * SIZE(AO1)
436 LFD a5, 0 * SIZE(AO2)
437 LFD a6, 1 * SIZE(AO2)
438 LFD a7, 2 * SIZE(AO2)
439 LFD a8, 3 * SIZE(AO2)
441 LFD xtemp1, 0 * SIZE(XX)
442 LFD xtemp2, 1 * SIZE(XX)
443 LFD xtemp3, 2 * SIZE(XX)
444 LFD xtemp4, 3 * SIZE(XX)
446 LFD y01, 0 * SIZE(YY)
447 LFD y02, 1 * SIZE(YY)
448 LFD y03, 2 * SIZE(YY)
449 LFD y04, 3 * SIZE(YY)
/* First unrolled pass: four groups, each consuming four values from AO1, */
/* AO2, XX and YY (16 * SIZE bytes per pointer in total). In every group: */
/*   xsum1/xsum2 accumulate xtemp * (a1..a4, the AO1 values), */
/*   xsum3/xsum4 accumulate xtemp * (a5..a8, the AO2 values), */
/*   the y registers accumulate atemp1/2 * AO1 values and atemp3/4 * AO2 */
/*   values. */
/* Loads for the following group and stores of the previous group's y */
/* results are interleaved with the arithmetic, so statement order matters. */
/* NOTE(review): this appears to be a peeled first iteration, since it has */
/* no stores for the last group's y05..y08; labels and branches are not */
/* visible here. */
/* Group 1: updates y01..y04 (offsets 0..3); preloads offsets 4..7. */
455 FMADD xsum1, xtemp1, a1, xsum1
457 FMADD y01, atemp1, a1, y01
460 FMADD xsum2, xtemp2, a1, xsum2
462 FMADD y02, atemp2, a1, y02
463 LFD a1, 4 * SIZE(AO1)
465 FMADD xsum3, xtemp1, a5, xsum3
467 FMADD y03, atemp1, a3, y03
470 FMADD xsum4, xtemp2, a5, xsum4
472 FMADD y04, atemp2, a3, y04
475 FMADD1 xsum1, xtemp2, a2, xsum1
476 LFD y05, 4 * SIZE(YY)
477 FNMSUB y01, atemp2, a2, y01
480 FMADD2 xsum2, xtemp1, a2, xsum2
481 LFD y06, 5 * SIZE(YY)
482 FMADD y02, atemp1, a2, y02
483 LFD a2, 5 * SIZE(AO1)
485 FMADD1 xsum3, xtemp2, a6, xsum3
486 LFD xtemp2, 5 * SIZE(XX)
487 FNMSUB y03, atemp2, a4, y03
490 FMADD2 xsum4, xtemp1, a6, xsum4
491 LFD xtemp1, 4 * SIZE(XX)
492 FMADD y04, atemp1, a4, y04
495 FMADD xsum1, xtemp3, a3, xsum1
496 LFD y07, 6 * SIZE(YY)
497 FMADD y01, atemp3, a5, y01
500 FMADD xsum2, xtemp4, a3, xsum2
501 LFD a3, 6 * SIZE(AO1)
502 FMADD y02, atemp4, a5, y02
503 LFD a5, 4 * SIZE(AO2)
505 FMADD xsum3, xtemp3, a7, xsum3
506 LFD y08, 7 * SIZE(YY)
507 FMADD y03, atemp3, a7, y03
510 FMADD xsum4, xtemp4, a7, xsum4
512 FMADD y04, atemp4, a7, y04
513 LFD a7, 6 * SIZE(AO2)
515 FMADD1 xsum1, xtemp4, a4, xsum1
517 FNMSUB y01, atemp4, a6, y01
521 FMADD2 xsum2, xtemp3, a4, xsum2
522 LFD a4, 7 * SIZE(AO1)
523 FMADD y02, atemp3, a6, y02
524 LFD a6, 5 * SIZE(AO2)
526 FMADD1 xsum3, xtemp4, a8, xsum3
527 LFD xtemp4, 7 * SIZE(XX)
528 FNMSUB y03, atemp4, a8, y03
531 FMADD2 xsum4, xtemp3, a8, xsum4
532 LFD xtemp3, 6 * SIZE(XX)
533 FMADD y04, atemp3, a8, y04
534 LFD a8, 7 * SIZE(AO2)
/* Group 2: updates y05..y08 (offsets 4..7); stores y01..y04 to offsets */
/* 0..3; preloads offsets 8..11. */
536 FMADD xsum1, xtemp1, a1, xsum1
537 STFD y01, 0 * SIZE(YY)
538 FMADD y05, atemp1, a1, y05
541 FMADD xsum2, xtemp2, a1, xsum2
542 STFD y02, 1 * SIZE(YY)
543 FMADD y06, atemp2, a1, y06
544 LFD a1, 8 * SIZE(AO1)
546 FMADD xsum3, xtemp1, a5, xsum3
547 STFD y03, 2 * SIZE(YY)
548 FMADD y07, atemp1, a3, y07
551 FMADD xsum4, xtemp2, a5, xsum4
552 STFD y04, 3 * SIZE(YY)
553 FMADD y08, atemp2, a3, y08
556 FMADD1 xsum1, xtemp2, a2, xsum1
557 LFD y01, 8 * SIZE(YY)
558 FNMSUB y05, atemp2, a2, y05
561 FMADD2 xsum2, xtemp1, a2, xsum2
562 LFD y02, 9 * SIZE(YY)
563 FMADD y06, atemp1, a2, y06
564 LFD a2, 9 * SIZE(AO1)
566 FMADD1 xsum3, xtemp2, a6, xsum3
567 LFD xtemp2, 9 * SIZE(XX)
568 FNMSUB y07, atemp2, a4, y07
571 FMADD2 xsum4, xtemp1, a6, xsum4
572 LFD xtemp1, 8 * SIZE(XX)
573 FMADD y08, atemp1, a4, y08
576 FMADD xsum1, xtemp3, a3, xsum1
577 LFD y03, 10 * SIZE(YY)
578 FMADD y05, atemp3, a5, y05
581 FMADD xsum2, xtemp4, a3, xsum2
582 LFD a3, 10 * SIZE(AO1)
583 FMADD y06, atemp4, a5, y06
584 LFD a5, 8 * SIZE(AO2)
586 FMADD xsum3, xtemp3, a7, xsum3
587 LFD y04, 11 * SIZE(YY)
588 FMADD y07, atemp3, a7, y07
591 FMADD xsum4, xtemp4, a7, xsum4
593 FMADD y08, atemp4, a7, y08
594 LFD a7, 10 * SIZE(AO2)
596 FMADD1 xsum1, xtemp4, a4, xsum1
598 FNMSUB y05, atemp4, a6, y05
601 FMADD2 xsum2, xtemp3, a4, xsum2
602 LFD a4, 11 * SIZE(AO1)
603 FMADD y06, atemp3, a6, y06
604 LFD a6, 9 * SIZE(AO2)
606 FMADD1 xsum3, xtemp4, a8, xsum3
607 LFD xtemp4, 11 * SIZE(XX)
608 FNMSUB y07, atemp4, a8, y07
613 FMADD2 xsum4, xtemp3, a8, xsum4
614 LFD xtemp3, 10 * SIZE(XX)
615 FMADD y08, atemp3, a8, y08
616 LFD a8, 11 * SIZE(AO2)
/* Group 3: updates y01..y04 (offsets 8..11); stores y05..y08 to offsets */
/* 4..7; preloads offsets 12..15. */
618 FMADD xsum1, xtemp1, a1, xsum1
619 STFD y05, 4 * SIZE(YY)
620 FMADD y01, atemp1, a1, y01
623 FMADD xsum2, xtemp2, a1, xsum2
624 STFD y06, 5 * SIZE(YY)
625 FMADD y02, atemp2, a1, y02
626 LFD a1, 12 * SIZE(AO1)
628 FMADD xsum3, xtemp1, a5, xsum3
629 STFD y07, 6 * SIZE(YY)
630 FMADD y03, atemp1, a3, y03
633 FMADD xsum4, xtemp2, a5, xsum4
634 STFD y08, 7 * SIZE(YY)
635 FMADD y04, atemp2, a3, y04
638 FMADD1 xsum1, xtemp2, a2, xsum1
639 LFD y05, 12 * SIZE(YY)
640 FNMSUB y01, atemp2, a2, y01
643 FMADD2 xsum2, xtemp1, a2, xsum2
644 LFD y06, 13 * SIZE(YY)
645 FMADD y02, atemp1, a2, y02
646 LFD a2, 13 * SIZE(AO1)
648 FMADD1 xsum3, xtemp2, a6, xsum3
649 LFD xtemp2, 13 * SIZE(XX)
650 FNMSUB y03, atemp2, a4, y03
653 FMADD2 xsum4, xtemp1, a6, xsum4
654 LFD xtemp1, 12 * SIZE(XX)
655 FMADD y04, atemp1, a4, y04
658 FMADD xsum1, xtemp3, a3, xsum1
659 LFD y07, 14 * SIZE(YY)
660 FMADD y01, atemp3, a5, y01
663 FMADD xsum2, xtemp4, a3, xsum2
664 LFD a3, 14 * SIZE(AO1)
665 FMADD y02, atemp4, a5, y02
666 LFD a5, 12 * SIZE(AO2)
668 FMADD xsum3, xtemp3, a7, xsum3
669 LFD y08, 15 * SIZE(YY)
670 FMADD y03, atemp3, a7, y03
673 FMADD xsum4, xtemp4, a7, xsum4
675 FMADD y04, atemp4, a7, y04
676 LFD a7, 14 * SIZE(AO2)
678 FMADD1 xsum1, xtemp4, a4, xsum1
680 FNMSUB y01, atemp4, a6, y01
684 FMADD2 xsum2, xtemp3, a4, xsum2
685 LFD a4, 15 * SIZE(AO1)
686 FMADD y02, atemp3, a6, y02
687 LFD a6, 13 * SIZE(AO2)
689 FMADD1 xsum3, xtemp4, a8, xsum3
690 LFD xtemp4, 15 * SIZE(XX)
691 FNMSUB y03, atemp4, a8, y03
694 FMADD2 xsum4, xtemp3, a8, xsum4
695 LFD xtemp3, 14 * SIZE(XX)
696 FMADD y04, atemp3, a8, y04
697 LFD a8, 15 * SIZE(AO2)
/* Group 4: updates y05..y08 (offsets 12..15); stores y01..y04 to offsets */
/* 8..11. AO2, XX, AO1 and YY are each advanced by 16 * SIZE part-way */
/* through, so a load placed after a pointer's addi uses offsets relative */
/* to the advanced pointer (offsets 16..19 before, 0..3 after). */
699 FMADD xsum1, xtemp1, a1, xsum1
700 STFD y01, 8 * SIZE(YY)
701 FMADD y05, atemp1, a1, y05
704 FMADD xsum2, xtemp2, a1, xsum2
705 STFD y02, 9 * SIZE(YY)
706 FMADD y06, atemp2, a1, y06
707 LFD a1, 16 * SIZE(AO1)
709 FMADD xsum3, xtemp1, a5, xsum3
710 STFD y03, 10 * SIZE(YY)
711 FMADD y07, atemp1, a3, y07
714 FMADD xsum4, xtemp2, a5, xsum4
715 STFD y04, 11 * SIZE(YY)
716 FMADD y08, atemp2, a3, y08
719 FMADD1 xsum1, xtemp2, a2, xsum1
720 LFD y01, 16 * SIZE(YY)
721 FNMSUB y05, atemp2, a2, y05
724 FMADD2 xsum2, xtemp1, a2, xsum2
725 LFD y02, 17 * SIZE(YY)
726 FMADD y06, atemp1, a2, y06
727 LFD a2, 17 * SIZE(AO1)
729 FMADD1 xsum3, xtemp2, a6, xsum3
730 LFD xtemp2, 17 * SIZE(XX)
731 FNMSUB y07, atemp2, a4, y07
734 FMADD2 xsum4, xtemp1, a6, xsum4
735 LFD xtemp1, 16 * SIZE(XX)
736 FMADD y08, atemp1, a4, y08
737 addi AO2, AO2, 16 * SIZE
739 FMADD xsum1, xtemp3, a3, xsum1
740 LFD y03, 18 * SIZE(YY)
741 FMADD y05, atemp3, a5, y05
742 addi XX, XX, 16 * SIZE
744 FMADD xsum2, xtemp4, a3, xsum2
745 LFD a3, 18 * SIZE(AO1)
746 FMADD y06, atemp4, a5, y06
747 LFD a5, 0 * SIZE(AO2)
749 FMADD xsum3, xtemp3, a7, xsum3
750 LFD y04, 19 * SIZE(YY)
751 FMADD y07, atemp3, a7, y07
754 FMADD xsum4, xtemp4, a7, xsum4
755 addi AO1, AO1, 16 * SIZE
756 FMADD y08, atemp4, a7, y08
757 LFD a7, 2 * SIZE(AO2)
759 FMADD1 xsum1, xtemp4, a4, xsum1
760 addi YY, YY, 16 * SIZE
761 FNMSUB y05, atemp4, a6, y05
764 FMADD2 xsum2, xtemp3, a4, xsum2
765 LFD a4, 3 * SIZE(AO1)
766 FMADD y06, atemp3, a6, y06
767 LFD a6, 1 * SIZE(AO2)
769 FMADD1 xsum3, xtemp4, a8, xsum3
770 LFD xtemp4, 3 * SIZE(XX)
771 FNMSUB y07, atemp4, a8, y07
774 FMADD2 xsum4, xtemp3, a8, xsum4
775 LFD xtemp3, 2 * SIZE(XX)
776 FMADD y08, atemp3, a8, y08
777 LFD a8, 3 * SIZE(AO2)
/* Second unrolled pass, with the same four-group arithmetic as the pass */
/* before it. The difference is in group 1, which also stores the y05..y08 */
/* values left pending by the previous pass; YY has already been advanced, */
/* so they land at offsets -4..-1. The four stores at the very end write */
/* the y05..y08 values produced by this pass's own group 4. */
/* NOTE(review): this appears to be the steady-state loop body followed by */
/* its drain; the loop label and branch are not visible here. */
/* Group 1: updates y01..y04 (offsets 0..3); stores pending y05..y08 to */
/* offsets -4..-1; preloads offsets 4..7. */
779 FMADD xsum1, xtemp1, a1, xsum1
780 STFD y05, -4 * SIZE(YY)
781 FMADD y01, atemp1, a1, y01
784 FMADD xsum2, xtemp2, a1, xsum2
785 STFD y06, -3 * SIZE(YY)
786 FMADD y02, atemp2, a1, y02
787 LFD a1, 4 * SIZE(AO1)
789 FMADD xsum3, xtemp1, a5, xsum3
790 STFD y07, -2 * SIZE(YY)
791 FMADD y03, atemp1, a3, y03
794 FMADD xsum4, xtemp2, a5, xsum4
795 STFD y08, -1 * SIZE(YY)
796 FMADD y04, atemp2, a3, y04
799 FMADD1 xsum1, xtemp2, a2, xsum1
800 LFD y05, 4 * SIZE(YY)
801 FNMSUB y01, atemp2, a2, y01
804 FMADD2 xsum2, xtemp1, a2, xsum2
805 LFD y06, 5 * SIZE(YY)
806 FMADD y02, atemp1, a2, y02
807 LFD a2, 5 * SIZE(AO1)
809 FMADD1 xsum3, xtemp2, a6, xsum3
810 LFD xtemp2, 5 * SIZE(XX)
811 FNMSUB y03, atemp2, a4, y03
814 FMADD2 xsum4, xtemp1, a6, xsum4
815 LFD xtemp1, 4 * SIZE(XX)
816 FMADD y04, atemp1, a4, y04
819 FMADD xsum1, xtemp3, a3, xsum1
820 LFD y07, 6 * SIZE(YY)
821 FMADD y01, atemp3, a5, y01
824 FMADD xsum2, xtemp4, a3, xsum2
825 LFD a3, 6 * SIZE(AO1)
826 FMADD y02, atemp4, a5, y02
827 LFD a5, 4 * SIZE(AO2)
829 FMADD xsum3, xtemp3, a7, xsum3
830 LFD y08, 7 * SIZE(YY)
831 FMADD y03, atemp3, a7, y03
834 FMADD xsum4, xtemp4, a7, xsum4
836 FMADD y04, atemp4, a7, y04
837 LFD a7, 6 * SIZE(AO2)
839 FMADD1 xsum1, xtemp4, a4, xsum1
841 FNMSUB y01, atemp4, a6, y01
845 FMADD2 xsum2, xtemp3, a4, xsum2
846 LFD a4, 7 * SIZE(AO1)
847 FMADD y02, atemp3, a6, y02
848 LFD a6, 5 * SIZE(AO2)
850 FMADD1 xsum3, xtemp4, a8, xsum3
851 LFD xtemp4, 7 * SIZE(XX)
852 FNMSUB y03, atemp4, a8, y03
855 FMADD2 xsum4, xtemp3, a8, xsum4
856 LFD xtemp3, 6 * SIZE(XX)
857 FMADD y04, atemp3, a8, y04
858 LFD a8, 7 * SIZE(AO2)
/* Group 2: updates y05..y08 (offsets 4..7); stores y01..y04 to offsets */
/* 0..3; preloads offsets 8..11. */
860 FMADD xsum1, xtemp1, a1, xsum1
861 STFD y01, 0 * SIZE(YY)
862 FMADD y05, atemp1, a1, y05
865 FMADD xsum2, xtemp2, a1, xsum2
866 STFD y02, 1 * SIZE(YY)
867 FMADD y06, atemp2, a1, y06
868 LFD a1, 8 * SIZE(AO1)
870 FMADD xsum3, xtemp1, a5, xsum3
871 STFD y03, 2 * SIZE(YY)
872 FMADD y07, atemp1, a3, y07
875 FMADD xsum4, xtemp2, a5, xsum4
876 STFD y04, 3 * SIZE(YY)
877 FMADD y08, atemp2, a3, y08
880 FMADD1 xsum1, xtemp2, a2, xsum1
881 LFD y01, 8 * SIZE(YY)
882 FNMSUB y05, atemp2, a2, y05
885 FMADD2 xsum2, xtemp1, a2, xsum2
886 LFD y02, 9 * SIZE(YY)
887 FMADD y06, atemp1, a2, y06
888 LFD a2, 9 * SIZE(AO1)
890 FMADD1 xsum3, xtemp2, a6, xsum3
891 LFD xtemp2, 9 * SIZE(XX)
892 FNMSUB y07, atemp2, a4, y07
895 FMADD2 xsum4, xtemp1, a6, xsum4
896 LFD xtemp1, 8 * SIZE(XX)
897 FMADD y08, atemp1, a4, y08
900 FMADD xsum1, xtemp3, a3, xsum1
901 LFD y03, 10 * SIZE(YY)
902 FMADD y05, atemp3, a5, y05
905 FMADD xsum2, xtemp4, a3, xsum2
906 LFD a3, 10 * SIZE(AO1)
907 FMADD y06, atemp4, a5, y06
908 LFD a5, 8 * SIZE(AO2)
910 FMADD xsum3, xtemp3, a7, xsum3
911 LFD y04, 11 * SIZE(YY)
912 FMADD y07, atemp3, a7, y07
915 FMADD xsum4, xtemp4, a7, xsum4
917 FMADD y08, atemp4, a7, y08
918 LFD a7, 10 * SIZE(AO2)
920 FMADD1 xsum1, xtemp4, a4, xsum1
922 FNMSUB y05, atemp4, a6, y05
925 FMADD2 xsum2, xtemp3, a4, xsum2
926 LFD a4, 11 * SIZE(AO1)
927 FMADD y06, atemp3, a6, y06
928 LFD a6, 9 * SIZE(AO2)
930 FMADD1 xsum3, xtemp4, a8, xsum3
931 LFD xtemp4, 11 * SIZE(XX)
932 FNMSUB y07, atemp4, a8, y07
937 FMADD2 xsum4, xtemp3, a8, xsum4
938 LFD xtemp3, 10 * SIZE(XX)
939 FMADD y08, atemp3, a8, y08
940 LFD a8, 11 * SIZE(AO2)
/* Group 3: updates y01..y04 (offsets 8..11); stores y05..y08 to offsets */
/* 4..7; preloads offsets 12..15. */
942 FMADD xsum1, xtemp1, a1, xsum1
943 STFD y05, 4 * SIZE(YY)
944 FMADD y01, atemp1, a1, y01
947 FMADD xsum2, xtemp2, a1, xsum2
948 STFD y06, 5 * SIZE(YY)
949 FMADD y02, atemp2, a1, y02
950 LFD a1, 12 * SIZE(AO1)
952 FMADD xsum3, xtemp1, a5, xsum3
953 STFD y07, 6 * SIZE(YY)
954 FMADD y03, atemp1, a3, y03
957 FMADD xsum4, xtemp2, a5, xsum4
958 STFD y08, 7 * SIZE(YY)
959 FMADD y04, atemp2, a3, y04
962 FMADD1 xsum1, xtemp2, a2, xsum1
963 LFD y05, 12 * SIZE(YY)
964 FNMSUB y01, atemp2, a2, y01
967 FMADD2 xsum2, xtemp1, a2, xsum2
968 LFD y06, 13 * SIZE(YY)
969 FMADD y02, atemp1, a2, y02
970 LFD a2, 13 * SIZE(AO1)
972 FMADD1 xsum3, xtemp2, a6, xsum3
973 LFD xtemp2, 13 * SIZE(XX)
974 FNMSUB y03, atemp2, a4, y03
977 FMADD2 xsum4, xtemp1, a6, xsum4
978 LFD xtemp1, 12 * SIZE(XX)
979 FMADD y04, atemp1, a4, y04
982 FMADD xsum1, xtemp3, a3, xsum1
983 LFD y07, 14 * SIZE(YY)
984 FMADD y01, atemp3, a5, y01
987 FMADD xsum2, xtemp4, a3, xsum2
988 LFD a3, 14 * SIZE(AO1)
989 FMADD y02, atemp4, a5, y02
990 LFD a5, 12 * SIZE(AO2)
992 FMADD xsum3, xtemp3, a7, xsum3
993 LFD y08, 15 * SIZE(YY)
994 FMADD y03, atemp3, a7, y03
997 FMADD xsum4, xtemp4, a7, xsum4
999 FMADD y04, atemp4, a7, y04
1000 LFD a7, 14 * SIZE(AO2)
1002 FMADD1 xsum1, xtemp4, a4, xsum1
1004 FNMSUB y01, atemp4, a6, y01
1007 FMADD2 xsum2, xtemp3, a4, xsum2
1008 LFD a4, 15 * SIZE(AO1)
1009 FMADD y02, atemp3, a6, y02
1010 LFD a6, 13 * SIZE(AO2)
1012 FMADD1 xsum3, xtemp4, a8, xsum3
1013 LFD xtemp4, 15 * SIZE(XX)
1014 FNMSUB y03, atemp4, a8, y03
1017 FMADD2 xsum4, xtemp3, a8, xsum4
1018 LFD xtemp3, 14 * SIZE(XX)
1019 FMADD y04, atemp3, a8, y04
1020 LFD a8, 15 * SIZE(AO2)
/* Group 4: updates y05..y08 (offsets 12..15); stores y01..y04 to offsets */
/* 8..11; advances AO2, XX, AO1 and YY by 16 * SIZE part-way through, so */
/* later loads use offsets relative to the advanced pointers. */
1022 FMADD xsum1, xtemp1, a1, xsum1
1023 STFD y01, 8 * SIZE(YY)
1024 FMADD y05, atemp1, a1, y05
1027 FMADD xsum2, xtemp2, a1, xsum2
1028 STFD y02, 9 * SIZE(YY)
1029 FMADD y06, atemp2, a1, y06
1030 LFD a1, 16 * SIZE(AO1)
1032 FMADD xsum3, xtemp1, a5, xsum3
1033 STFD y03, 10 * SIZE(YY)
1034 FMADD y07, atemp1, a3, y07
1037 FMADD xsum4, xtemp2, a5, xsum4
1038 STFD y04, 11 * SIZE(YY)
1039 FMADD y08, atemp2, a3, y08
1042 FMADD1 xsum1, xtemp2, a2, xsum1
1043 LFD y01, 16 * SIZE(YY)
1044 FNMSUB y05, atemp2, a2, y05
1047 FMADD2 xsum2, xtemp1, a2, xsum2
1048 LFD y02, 17 * SIZE(YY)
1049 FMADD y06, atemp1, a2, y06
1050 LFD a2, 17 * SIZE(AO1)
1052 FMADD1 xsum3, xtemp2, a6, xsum3
1053 LFD xtemp2, 17 * SIZE(XX)
1054 FNMSUB y07, atemp2, a4, y07
1057 FMADD2 xsum4, xtemp1, a6, xsum4
1058 LFD xtemp1, 16 * SIZE(XX)
1059 FMADD y08, atemp1, a4, y08
1060 addi AO2, AO2, 16 * SIZE
1062 FMADD xsum1, xtemp3, a3, xsum1
1063 LFD y03, 18 * SIZE(YY)
1064 FMADD y05, atemp3, a5, y05
1065 addi XX, XX, 16 * SIZE
1067 FMADD xsum2, xtemp4, a3, xsum2
1068 LFD a3, 18 * SIZE(AO1)
1069 FMADD y06, atemp4, a5, y06
1070 LFD a5, 0 * SIZE(AO2)
1072 FMADD xsum3, xtemp3, a7, xsum3
1073 LFD y04, 19 * SIZE(YY)
1074 FMADD y07, atemp3, a7, y07
1077 FMADD xsum4, xtemp4, a7, xsum4
1078 addi AO1, AO1, 16 * SIZE
1079 FMADD y08, atemp4, a7, y08
1080 LFD a7, 2 * SIZE(AO2)
1082 FMADD1 xsum1, xtemp4, a4, xsum1
1083 addi YY, YY, 16 * SIZE
1084 FNMSUB y05, atemp4, a6, y05
1087 FMADD2 xsum2, xtemp3, a4, xsum2
1088 LFD a4, 3 * SIZE(AO1)
1089 FMADD y06, atemp3, a6, y06
1090 LFD a6, 1 * SIZE(AO2)
1092 FMADD1 xsum3, xtemp4, a8, xsum3
1093 LFD xtemp4, 3 * SIZE(XX)
1094 FNMSUB y07, atemp4, a8, y07
1097 FMADD2 xsum4, xtemp3, a8, xsum4
1098 LFD xtemp3, 2 * SIZE(XX)
1099 FMADD y08, atemp3, a8, y08
1100 LFD a8, 3 * SIZE(AO2)
/* Drain: write the y05..y08 values produced by group 4 above; YY has */
/* already been advanced, hence the negative offsets. */
1102 STFD y05, -4 * SIZE(YY)
1103 STFD y06, -3 * SIZE(YY)
1104 STFD y07, -2 * SIZE(YY)
1105 STFD y08, -1 * SIZE(YY)
/* Half-size step: two groups using the same arithmetic as the full pass, */
/* covering offsets 0..7 of each pointer, after which AO1, AO2, XX and YY */
/* advance by 8 * SIZE. NOTE(review): presumably entered only when a */
/* remainder bit of the row count is set; the test and branch are not */
/* visible here. */
/* Group 1: updates y01..y04 (offsets 0..3); preloads offsets 4..7. */
1112 FMADD xsum1, xtemp1, a1, xsum1
1114 FMADD y01, atemp1, a1, y01
1117 FMADD xsum2, xtemp2, a1, xsum2
1119 FMADD y02, atemp2, a1, y02
1120 LFD a1, 4 * SIZE(AO1)
1122 FMADD xsum3, xtemp1, a5, xsum3
1124 FMADD y03, atemp1, a3, y03
1127 FMADD xsum4, xtemp2, a5, xsum4
1129 FMADD y04, atemp2, a3, y04
1132 FMADD1 xsum1, xtemp2, a2, xsum1
1133 LFD y05, 4 * SIZE(YY)
1134 FNMSUB y01, atemp2, a2, y01
1137 FMADD2 xsum2, xtemp1, a2, xsum2
1138 LFD y06, 5 * SIZE(YY)
1139 FMADD y02, atemp1, a2, y02
1140 LFD a2, 5 * SIZE(AO1)
1142 FMADD1 xsum3, xtemp2, a6, xsum3
1143 LFD xtemp2, 5 * SIZE(XX)
1144 FNMSUB y03, atemp2, a4, y03
1147 FMADD2 xsum4, xtemp1, a6, xsum4
1148 LFD xtemp1, 4 * SIZE(XX)
1149 FMADD y04, atemp1, a4, y04
1152 FMADD xsum1, xtemp3, a3, xsum1
1153 LFD y07, 6 * SIZE(YY)
1154 FMADD y01, atemp3, a5, y01
1157 FMADD xsum2, xtemp4, a3, xsum2
1158 LFD a3, 6 * SIZE(AO1)
1159 FMADD y02, atemp4, a5, y02
1160 LFD a5, 4 * SIZE(AO2)
1162 FMADD xsum3, xtemp3, a7, xsum3
1163 LFD y08, 7 * SIZE(YY)
1164 FMADD y03, atemp3, a7, y03
1167 FMADD xsum4, xtemp4, a7, xsum4
1169 FMADD y04, atemp4, a7, y04
1170 LFD a7, 6 * SIZE(AO2)
1172 FMADD1 xsum1, xtemp4, a4, xsum1
1174 FNMSUB y01, atemp4, a6, y01
1177 FMADD2 xsum2, xtemp3, a4, xsum2
1178 LFD a4, 7 * SIZE(AO1)
1179 FMADD y02, atemp3, a6, y02
1180 LFD a6, 5 * SIZE(AO2)
1182 FMADD1 xsum3, xtemp4, a8, xsum3
1183 LFD xtemp4, 7 * SIZE(XX)
1184 FNMSUB y03, atemp4, a8, y03
1187 FMADD2 xsum4, xtemp3, a8, xsum4
1188 LFD xtemp3, 6 * SIZE(XX)
1189 FMADD y04, atemp3, a8, y04
1190 LFD a8, 7 * SIZE(AO2)
/* Group 2: updates y05..y08 (offsets 4..7); stores y01..y04 to offsets */
/* 0..3; preloads offsets 8..11 for whatever code follows. */
1192 FMADD xsum1, xtemp1, a1, xsum1
1193 STFD y01, 0 * SIZE(YY)
1194 FMADD y05, atemp1, a1, y05
1197 FMADD xsum2, xtemp2, a1, xsum2
1198 STFD y02, 1 * SIZE(YY)
1199 FMADD y06, atemp2, a1, y06
1200 LFD a1, 8 * SIZE(AO1)
1202 FMADD xsum3, xtemp1, a5, xsum3
1203 STFD y03, 2 * SIZE(YY)
1204 FMADD y07, atemp1, a3, y07
1207 FMADD xsum4, xtemp2, a5, xsum4
1208 STFD y04, 3 * SIZE(YY)
1209 FMADD y08, atemp2, a3, y08
1212 FMADD1 xsum1, xtemp2, a2, xsum1
1213 LFD y01, 8 * SIZE(YY)
1214 FNMSUB y05, atemp2, a2, y05
1217 FMADD2 xsum2, xtemp1, a2, xsum2
1218 LFD y02, 9 * SIZE(YY)
1219 FMADD y06, atemp1, a2, y06
1220 LFD a2, 9 * SIZE(AO1)
1222 FMADD1 xsum3, xtemp2, a6, xsum3
1223 LFD xtemp2, 9 * SIZE(XX)
1224 FNMSUB y07, atemp2, a4, y07
1227 FMADD2 xsum4, xtemp1, a6, xsum4
1228 LFD xtemp1, 8 * SIZE(XX)
1229 FMADD y08, atemp1, a4, y08
1232 FMADD xsum1, xtemp3, a3, xsum1
1233 LFD y03, 10 * SIZE(YY)
1234 FMADD y05, atemp3, a5, y05
1237 FMADD xsum2, xtemp4, a3, xsum2
1238 LFD a3, 10 * SIZE(AO1)
1239 FMADD y06, atemp4, a5, y06
1240 LFD a5, 8 * SIZE(AO2)
1242 FMADD xsum3, xtemp3, a7, xsum3
1243 LFD y04, 11 * SIZE(YY)
1244 FMADD y07, atemp3, a7, y07
1247 FMADD xsum4, xtemp4, a7, xsum4
1249 FMADD y08, atemp4, a7, y08
1250 LFD a7, 10 * SIZE(AO2)
1252 FMADD1 xsum1, xtemp4, a4, xsum1
1254 FNMSUB y05, atemp4, a6, y05
1257 FMADD2 xsum2, xtemp3, a4, xsum2
1258 LFD a4, 11 * SIZE(AO1)
1259 FMADD y06, atemp3, a6, y06
1260 LFD a6, 9 * SIZE(AO2)
1262 FMADD1 xsum3, xtemp4, a8, xsum3
1263 LFD xtemp4, 11 * SIZE(XX)
1264 FNMSUB y07, atemp4, a8, y07
1266 FMADD2 xsum4, xtemp3, a8, xsum4
1267 LFD xtemp3, 10 * SIZE(XX)
1268 FMADD y08, atemp3, a8, y08
1269 LFD a8, 11 * SIZE(AO2)
/* Write y05..y08 to offsets 4..7, then step every pointer past the eight */
/* values consumed. */
1271 STFD y05, 4 * SIZE(YY)
1272 STFD y06, 5 * SIZE(YY)
1273 STFD y07, 6 * SIZE(YY)
1274 STFD y08, 7 * SIZE(YY)
1276 addi AO1, AO1, 8 * SIZE
1277 addi AO2, AO2, 8 * SIZE
1279 addi XX, XX, 8 * SIZE
1280 addi YY, YY, 8 * SIZE
/* Quarter-size step: one group with no interleaved loads. It updates */
/* xsum1..xsum4 and y01..y04 from the values already held in a1..a8, */
/* xtemp1..xtemp4 and atemp1..atemp4, then stores y01..y04 to offsets 0..3. */
/* NOTE(review): presumably executed only for a remainder of the row */
/* count; the guarding test and branch are not visible here. */
1287 FMADD xsum1, xtemp1, a1, xsum1
1288 FMADD y01, atemp1, a1, y01
1289 FMADD xsum2, xtemp2, a1, xsum2
1290 FMADD y02, atemp2, a1, y02
1291 FMADD xsum3, xtemp1, a5, xsum3
1292 FMADD y03, atemp1, a3, y03
1293 FMADD xsum4, xtemp2, a5, xsum4
1294 FMADD y04, atemp2, a3, y04
1296 FMADD1 xsum1, xtemp2, a2, xsum1
1297 FNMSUB y01, atemp2, a2, y01
1298 FMADD2 xsum2, xtemp1, a2, xsum2
1299 FMADD y02, atemp1, a2, y02
1300 FMADD1 xsum3, xtemp2, a6, xsum3
1301 FNMSUB y03, atemp2, a4, y03
1302 FMADD2 xsum4, xtemp1, a6, xsum4
1303 FMADD y04, atemp1, a4, y04
1305 FMADD xsum1, xtemp3, a3, xsum1
1306 FMADD y01, atemp3, a5, y01
1307 FMADD xsum2, xtemp4, a3, xsum2
1308 FMADD y02, atemp4, a5, y02
1309 FMADD xsum3, xtemp3, a7, xsum3
1310 FMADD y03, atemp3, a7, y03
1311 FMADD xsum4, xtemp4, a7, xsum4
1312 FMADD y04, atemp4, a7, y04
1314 FMADD1 xsum1, xtemp4, a4, xsum1
1315 FNMSUB y01, atemp4, a6, y01
1316 FMADD2 xsum2, xtemp3, a4, xsum2
1317 FMADD y02, atemp3, a6, y02
1318 FMADD1 xsum3, xtemp4, a8, xsum3
1319 FNMSUB y03, atemp4, a8, y03
1320 FMADD2 xsum4, xtemp3, a8, xsum4
1321 FMADD y04, atemp3, a8, y04
1323 STFD y01, 0 * SIZE(YY)
1324 STFD y02, 1 * SIZE(YY)
1325 STFD y03, 2 * SIZE(YY)
1326 STFD y04, 3 * SIZE(YY)
/* Reload operands from the next four offsets for the code that follows. */
/* Only a1 / a2 are refreshed from AO1, while all of a5..a8 are refreshed */
/* from AO2. Only YY is advanced here; AO1 and AO2 keep their values. */
1328 LFD a1, 4 * SIZE(AO1)
1329 LFD a2, 5 * SIZE(AO1)
1331 LFD a5, 4 * SIZE(AO2)
1332 LFD a6, 5 * SIZE(AO2)
1333 LFD a7, 6 * SIZE(AO2)
1334 LFD a8, 7 * SIZE(AO2)
1336 LFD y01, 4 * SIZE(YY)
1337 LFD y02, 5 * SIZE(YY)
1338 LFD y03, 6 * SIZE(YY)
1339 LFD y04, 7 * SIZE(YY)
1341 addi YY, YY, 4 * SIZE
/* Finish a column pair: scale the accumulated sums, add the trailing */
/* 2 x 2 block's contribution, and fold the result into y. */
/* Complex multiply of (xsum1 + i*xsum2) and (xsum3 + i*xsum4) by */
/* (y05 + i*y06), assuming FNMSUB d, a, c, b yields b - a*c. */
/* NOTE(review): y05 / y06 presumably hold alpha_r / alpha_i again here; */
/* the reload is not visible. */
1348 FMUL xtemp1, y05, xsum1
1349 FMUL xtemp2, y06, xsum1
1350 FMUL xtemp3, y05, xsum3
1351 FMUL xtemp4, y06, xsum3
1353 FNMSUB xsum1, y06, xsum2, xtemp1
1354 FMADD xsum2, y05, xsum2, xtemp2
1355 FNMSUB xsum3, y06, xsum4, xtemp3
1356 FMADD xsum4, y05, xsum4, xtemp4
/* Add atemp * block terms. The (a5, a6) pair feeds both xsum1/xsum2 (with */
/* atemp3/atemp4) and xsum3/xsum4 (with atemp1/atemp2), i.e. it is used for */
/* both off-diagonal positions of the block. */
1358 FMADD xsum1, atemp1, a1, xsum1
1359 FMADD xsum2, atemp2, a1, xsum2
1360 FMADD xsum3, atemp1, a5, xsum3
1361 FMADD xsum4, atemp2, a5, xsum4
1364 FMADD1 xsum1, atemp2, a2, xsum1
1365 FMADD2 xsum2, atemp1, a2, xsum2
1367 FMADD1 xsum3, atemp2, a6, xsum3
1368 FMADD2 xsum4, atemp1, a6, xsum4
1370 FMADD xsum1, atemp3, a5, xsum1
1371 FMADD xsum2, atemp4, a5, xsum2
1372 FMADD xsum3, atemp3, a7, xsum3
1373 FMADD xsum4, atemp4, a7, xsum4
1375 FNMSUB xsum1, atemp4, a6, xsum1
1376 FMADD xsum2, atemp3, a6, xsum2
1378 FNMSUB xsum3, atemp4, a8, xsum3
1379 FMADD xsum4, atemp3, a8, xsum4
/* y01..y04 += xsum1..xsum4, then write them back at offsets 0..3 of YY. */
1382 FADD y01, y01, xsum1
1383 FADD y02, y02, xsum2
1384 FADD y03, y03, xsum3
1385 FADD y04, y04, xsum4
1387 STFD y01, 0 * SIZE(YY)
1389 STFD y02, 1 * SIZE(YY)
1391 STFD y03, 2 * SIZE(YY)
1393 STFD y04, 3 * SIZE(YY)
/* Single-column variant of the column-pair code: one column pointer (AO1), */
/* one atemp pair and one xsum pair. NOTE(review): presumably handles a */
/* leftover column; the guarding test is not visible here. */
/* TEMP = IS scaled by 2^ZBASE_SHIFT; a base pointer is presumably added on */
/* a line not shown, since TEMP is used as an address below. */
1403 slwi TEMP, IS, ZBASE_SHIFT
1409 LFD xtemp1, 0 * SIZE(TEMP)
1410 LFD xtemp2, 1 * SIZE(TEMP)
/* atemp1 + i*atemp2 = (y05 + i*y06) * (xtemp1 + i*xtemp2), assuming */
/* FNMSUB d, a, c, b yields b - a*c. */
1412 FMUL atemp1, y05, xtemp1
1413 FMUL atemp2, y06, xtemp1
1415 FNMSUB atemp1, y06, xtemp2, atemp1
1416 FMADD atemp2, y05, xtemp2, atemp2
/* Preload the first two values from AO1, XX and YY. */
1424 LFD a1, 0 * SIZE(AO1)
1425 LFD a2, 1 * SIZE(AO1)
1427 LFD xtemp1, 0 * SIZE(XX)
1428 LFD xtemp2, 1 * SIZE(XX)
1430 LFD y01, 0 * SIZE(YY)
1431 LFD y02, 1 * SIZE(YY)
/* Per-step body: accumulate xsum1/xsum2 and y01/y02 while loading the next */
/* two values, advance the pointers by 2 * SIZE, store y01/y02 behind the */
/* advanced YY and load the next pair. The loop branch is not visible here. */
1439 FMADD xsum1, xtemp1, a1, xsum1
1440 FMADD y01, atemp1, a1, y01
1441 FMADD xsum2, xtemp2, a1, xsum2
1442 FMADD y02, atemp2, a1, y02
1443 LFD a1, 2 * SIZE(AO1)
1445 FMADD1 xsum1, xtemp2, a2, xsum1
1446 LFD xtemp2, 3 * SIZE(XX)
1447 FNMSUB y01, atemp2, a2, y01
1448 FMADD2 xsum2, xtemp1, a2, xsum2
1449 LFD xtemp1, 2 * SIZE(XX)
1450 FMADD y02, atemp1, a2, y02
1451 LFD a2, 3 * SIZE(AO1)
1453 addi AO1, AO1, 2 * SIZE
1454 addi XX, XX, 2 * SIZE
1455 addi YY, YY, 2 * SIZE
1457 STFD y01, -2 * SIZE(YY)
1458 LFD y01, 0 * SIZE(YY)
1459 STFD y02, -1 * SIZE(YY)
1460 LFD y02, 1 * SIZE(YY)
/* Scale (xsum1 + i*xsum2) by (y05 + i*y06), add the atemp * (a1, a2) term, */
/* fold the result into y01/y02 and store. */
1468 FMUL xtemp1, y05, xsum1
1469 FMUL xtemp2, y06, xsum1
1471 FNMSUB xsum1, y06, xsum2, xtemp1
1472 FMADD xsum2, y05, xsum2, xtemp2
1474 FMADD xsum1, atemp1, a1, xsum1
1475 FMADD xsum2, atemp2, a1, xsum2
1478 FNMSUB xsum1, atemp2, a2, xsum1
1479 FMADD xsum2, atemp1, a2, xsum2
1482 FADD y01, y01, xsum1
1483 FADD y02, y02, xsum2
1485 STFD y01, 0 * SIZE(YY)
1486 STFD y02, 1 * SIZE(YY)
/* Copy results from NEW_Y to YY, then release the stack frame. */
/* Compare the scaled INCY with 2 * SIZE; the branch on cr0 is not visible. */
/* NOTE(review): presumably the copy below is skipped when y is contiguous. */
1490 cmpwi cr0, INCY, 2 * SIZE
/* Wide step: read eight consecutive values from NEW_Y and advance it by */
/* 8 * SIZE, then write them as four pairs at offsets 0 / 1 of YY. */
/* NOTE(review): YY is presumably advanced between pairs on lines not */
/* shown; as written here each pair targets the same two offsets. */
1514 LFD f8, 0 * SIZE(NEW_Y)
1515 LFD f9, 1 * SIZE(NEW_Y)
1516 LFD f10, 2 * SIZE(NEW_Y)
1517 LFD f11, 3 * SIZE(NEW_Y)
1518 LFD f12, 4 * SIZE(NEW_Y)
1519 LFD f13, 5 * SIZE(NEW_Y)
1520 LFD f14, 6 * SIZE(NEW_Y)
1521 LFD f15, 7 * SIZE(NEW_Y)
1522 addi NEW_Y, NEW_Y, 8 * SIZE
1533 STFD f8, 0 * SIZE(YY)
1534 STFD f9, 1 * SIZE(YY)
1536 STFD f10, 0 * SIZE(YY)
1537 STFD f11, 1 * SIZE(YY)
1539 STFD f12, 0 * SIZE(YY)
1540 STFD f13, 1 * SIZE(YY)
1542 STFD f14, 0 * SIZE(YY)
1543 STFD f15, 1 * SIZE(YY)
/* Half step: four values, written as two pairs. */
1559 LFD f8, 0 * SIZE(NEW_Y)
1560 LFD f9, 1 * SIZE(NEW_Y)
1561 LFD f10, 2 * SIZE(NEW_Y)
1562 LFD f11, 3 * SIZE(NEW_Y)
1563 addi NEW_Y, NEW_Y, 4 * SIZE
1570 STFD f8, 0 * SIZE(YY)
1571 STFD f9, 1 * SIZE(YY)
1573 STFD f10, 0 * SIZE(YY)
1574 STFD f11, 1 * SIZE(YY)
/* Final step: one pair. */
1585 LFD f8, 0 * SIZE(NEW_Y)
1586 LFD f9, 1 * SIZE(NEW_Y)
1591 STFD f8, 0 * SIZE(YY)
1592 STFD f9, 1 * SIZE(YY)
/* Release the frame allocated in the prologue. Any register restores and */
/* the return instruction are not visible in this view. */
1649 addi SP, SP, STACKSIZE