1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration macros.
   NOTE(review): the #else / #elif / #endif lines and most of the CPU-selection
   #if lines that pair with the directives below are not visible in this
   excerpt, so it cannot be told from here which definition is active for a
   given target. */
72 #if defined(_AIX) || defined(__APPLE__)
73 #if !defined(__64BIT__) && defined(DOUBLE)
/* Register aliases: Y1 -> r23, Y2 -> r24. */
109 #define Y1 r23 /* dummy; should be same as gemv_n.S */
110 #define Y2 r24 /* dummy; should be same as gemv_n.S */
/* PREFETCHSIZE_A / PREFETCHSIZE_C are element counts: the setup code
   multiplies them by SIZE and loads the results into PREA / PREC.
   Each pair below is an alternative; only the PPC440 / PPC440FP2 guard
   is visible here. */
113 #define PREFETCHSIZE_A 34
114 #define PREFETCHSIZE_C 16
117 #if defined(PPC440) || defined(PPC440FP2)
118 #define PREFETCHSIZE_A 34
119 #define PREFETCHSIZE_C 16
123 #define PREFETCHSIZE_A 56
124 #define PREFETCHSIZE_C 16
128 #define PREFETCHSIZE_A 56
129 #define PREFETCHSIZE_C 16
133 #define PREFETCHSIZE_A 34
134 #define PREFETCHSIZE_C 16
138 #define PREFETCHSIZE_A 40
139 #define PREFETCHSIZE_C 8
143 #define PREFETCHSIZE_A 24
144 #define PREFETCHSIZE_C 8
148 #define PREFETCHSIZE_A 24
149 #define PREFETCHSIZE_C 8
/* FMADDR / FMSUBR are the sign-sensitive multiply-add forms used when the
   accumulated sums are combined with f30 / f31.  Their mapping depends on
   CONJ and XCONJ.  NOTE(review): only the two FNMSUB mappings are visible;
   the companion definitions and the #else between them are not shown. */
152 #if !(defined(CONJ) && defined(XCONJ))
154 #define FMSUBR FNMSUB
156 #define FMADDR FNMSUB
/* SP-relative slots named FZERO, ALPHA_R and ALPHA_I.  Two alternative
   layouts (base offset 200 or 256); the selecting condition is not visible. */
163 #define FZERO 200(SP)
164 #define ALPHA_R 208(SP)
165 #define ALPHA_I 216(SP)
167 #define FZERO 256(SP)
168 #define ALPHA_R 264(SP)
169 #define ALPHA_I 272(SP)
/* Function setup: allocate the stack frame, fetch the arguments that were
   passed on the stack, and scale the strides.
   NOTE(review): the register saves and the #if / #else / #endif lines that
   select between the load variants below are not visible in this excerpt. */
/* Lower SP by STACKSIZE to make room for this function's frame. */
175 addi SP, SP, -STACKSIZE
/* Stack-passed arguments are read at FRAMESLOT(n) + STACKSIZE(SP), i.e.
   relative to the SP value before the adjustment above.  lwz reads a
   32-bit word, ld a 64-bit doubleword; which arguments come from the stack
   and in which slot differs between the variants. */
230 lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
231 lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
233 ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
234 ld Y, FRAMESLOT(1) + STACKSIZE(SP)
235 ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
236 ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
240 #if defined(_AIX) || defined(__APPLE__)
/* Variant in which LDA and X are also taken from the stack. */
243 lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
244 lwz X, FRAMESLOT(1) + STACKSIZE(SP)
245 lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
246 lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
247 lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
248 lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
250 lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
251 lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
252 lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
253 lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
256 ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
257 ld Y, FRAMESLOT(1) + STACKSIZE(SP)
258 ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
259 ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
/* PLDA_M = (PLDA_M - XP) << ZBASE_SHIFT.  The instructions that give
   PLDA_M and XP their initial values are not visible here. */
268 subf PLDA_M, XP, PLDA_M
269 slwi PLDA_M, PLDA_M, ZBASE_SHIFT
/* Scale LDA, INCX and INCY by 2^ZBASE_SHIFT -- presumably converting
   element counts to byte strides; confirm against ZBASE_SHIFT's definition. */
271 slwi LDA, LDA, ZBASE_SHIFT
272 slwi INCX, INCX, ZBASE_SHIFT
273 slwi INCY, INCY, ZBASE_SHIFT
/* PREA / PREC = prefetch distances expressed in units of SIZE. */
277 li PREA, PREFETCHSIZE_A * SIZE
278 li PREC, PREFETCHSIZE_C * SIZE
/* r0 = IS << ZBASE_SHIFT. */
288 slwi r0, IS, ZBASE_SHIFT
/* Compare MIN_N with P into cr0; the branch using the result is not visible. */
289 cmpi cr0, 0, MIN_N, P
/* Compare the scaled INCX with 2 * SIZE -- presumably a test for
   contiguous X; the dependent branch is not visible. */
294 cmpwi cr0, INCX, 2 * SIZE
/* Store values held in f0..f7 into BUFFER through the CO pointer.
   NOTE(review): the loads that fill f0..f7 (presumably from X) and the loop
   control around these stores are not visible in this excerpt. */
/* CO = BUFFER - SIZE, so the offset 1 * SIZE addresses the first element
   of BUFFER. */
298 addi CO, BUFFER, -SIZE
/* Eight-value group: store at offsets 1..8 * SIZE.  The last store is the
   update form (assuming STFDU expands to a store-with-update instruction),
   which also advances CO by 8 * SIZE. */
319 STFD f0, 1 * SIZE(CO)
320 STFD f1, 2 * SIZE(CO)
321 STFD f2, 3 * SIZE(CO)
322 STFD f3, 4 * SIZE(CO)
323 STFD f4, 5 * SIZE(CO)
324 STFD f5, 6 * SIZE(CO)
325 STFD f6, 7 * SIZE(CO)
326 STFDU f7, 8 * SIZE(CO)
/* Two-value group: store one pair and advance CO by 2 * SIZE. */
340 STFD f0, 1 * SIZE(CO)
341 STFDU f1, 2 * SIZE(CO)
/* Four-column main kernel, software-pipelined loop body.
   Register roles, as established by the instructions below:
     f16,f17 / f18,f19 / f20,f21 / f22,f23 : current pair of values read
                                             through AO1 / AO2 / AO3 / AO4
     f24..f31                              : four pairs read through BO
     f0-f3 / f4-f7 / f8-f11 / f12-f15      : running sums for the AO1 / AO2 /
                                             AO3 / AO4 streams
   Each step performs, for every stream, sum += a * b for the four
   combinations of {a0, a1} x {b0, b1} (assuming FMADD d, a, c, b expands to
   the PowerPC fused multiply-add d = a * c + b).
   One pass covers eight pairs and advances AO1..AO4 and BO by 16 * SIZE.
   NOTE(review): the LL(MainKernel) label, the CTR setup, the zeroing of
   f0..f15, any prefetch instructions and the closing loop branch are not
   visible in this excerpt. */
/* Preload the first pair from each of the four A streams. */
383 LFD f16, 0 * SIZE(AO1)
384 LFD f17, 1 * SIZE(AO1)
385 LFD f18, 0 * SIZE(AO2)
386 LFD f19, 1 * SIZE(AO2)
387 LFD f20, 0 * SIZE(AO3)
388 LFD f21, 1 * SIZE(AO3)
389 LFD f22, 0 * SIZE(AO4)
390 LFD f23, 1 * SIZE(AO4)
/* Preload the first four pairs from BO (offsets start at 1 * SIZE). */
392 LFD f24, 1 * SIZE(BO)
393 LFD f25, 2 * SIZE(BO)
394 LFD f26, 3 * SIZE(BO)
395 LFD f27, 4 * SIZE(BO)
396 LFD f28, 5 * SIZE(BO)
397 LFD f29, 6 * SIZE(BO)
398 LFD f30, 7 * SIZE(BO)
399 LFD f31, 8 * SIZE(BO)
/* Decrement CTR; if it reaches zero only the drain pass is executed. */
401 bdz LL(MainKernelSkip)
/* Step 1 of 8: B pair f24/f25; AO1 and AO2 streams. */
405 FMADD f0, f16, f24, f0
406 FMADD f1, f16, f25, f1
407 FMADD f2, f17, f24, f2
408 FMADD f3, f17, f25, f3
410 FMADD f4, f18, f24, f4
411 FMADD f5, f18, f25, f5
412 FMADD f6, f19, f24, f6
413 FMADD f7, f19, f25, f7
/* Operands just consumed are reloaded with the next pair. */
415 LFD f16, 2 * SIZE(AO1)
416 LFD f17, 3 * SIZE(AO1)
417 LFD f18, 2 * SIZE(AO2)
418 LFD f19, 3 * SIZE(AO2)
/* Step 1, AO3 and AO4 streams. */
420 FMADD f8, f20, f24, f8
421 FMADD f9, f20, f25, f9
422 FMADD f10, f21, f24, f10
423 FMADD f11, f21, f25, f11
425 FMADD f12, f22, f24, f12
426 FMADD f13, f22, f25, f13
427 FMADD f14, f23, f24, f14
428 FMADD f15, f23, f25, f15
430 LFD f20, 2 * SIZE(AO3)
431 LFD f21, 3 * SIZE(AO3)
432 LFD f22, 2 * SIZE(AO4)
433 LFD f23, 3 * SIZE(AO4)
/* Step 2 of 8: B pair f26/f27. */
435 FMADD f0, f16, f26, f0
436 FMADD f1, f16, f27, f1
437 FMADD f2, f17, f26, f2
438 FMADD f3, f17, f27, f3
440 FMADD f4, f18, f26, f4
441 FMADD f5, f18, f27, f5
442 FMADD f6, f19, f26, f6
443 FMADD f7, f19, f27, f7
445 LFD f16, 4 * SIZE(AO1)
446 LFD f17, 5 * SIZE(AO1)
447 LFD f18, 4 * SIZE(AO2)
448 LFD f19, 5 * SIZE(AO2)
450 FMADD f8, f20, f26, f8
451 FMADD f9, f20, f27, f9
452 FMADD f10, f21, f26, f10
453 FMADD f11, f21, f27, f11
455 FMADD f12, f22, f26, f12
456 FMADD f13, f22, f27, f13
457 FMADD f14, f23, f26, f14
458 FMADD f15, f23, f27, f15
460 LFD f20, 4 * SIZE(AO3)
461 LFD f21, 5 * SIZE(AO3)
462 LFD f22, 4 * SIZE(AO4)
463 LFD f23, 5 * SIZE(AO4)
/* f24..f27 are free now: refill with B pairs 5 and 6. */
465 LFD f24, 9 * SIZE(BO)
466 LFD f25, 10 * SIZE(BO)
467 LFD f26, 11 * SIZE(BO)
468 LFD f27, 12 * SIZE(BO)
/* Step 3 of 8: B pair f28/f29. */
470 FMADD f0, f16, f28, f0
471 FMADD f1, f16, f29, f1
472 FMADD f2, f17, f28, f2
473 FMADD f3, f17, f29, f3
475 FMADD f4, f18, f28, f4
476 FMADD f5, f18, f29, f5
477 FMADD f6, f19, f28, f6
478 FMADD f7, f19, f29, f7
480 LFD f16, 6 * SIZE(AO1)
481 LFD f17, 7 * SIZE(AO1)
482 LFD f18, 6 * SIZE(AO2)
483 LFD f19, 7 * SIZE(AO2)
485 FMADD f8, f20, f28, f8
486 FMADD f9, f20, f29, f9
487 FMADD f10, f21, f28, f10
488 FMADD f11, f21, f29, f11
490 FMADD f12, f22, f28, f12
491 FMADD f13, f22, f29, f13
492 FMADD f14, f23, f28, f14
493 FMADD f15, f23, f29, f15
495 LFD f20, 6 * SIZE(AO3)
496 LFD f21, 7 * SIZE(AO3)
497 LFD f22, 6 * SIZE(AO4)
498 LFD f23, 7 * SIZE(AO4)
/* Step 4 of 8: B pair f30/f31. */
500 FMADD f0, f16, f30, f0
501 FMADD f1, f16, f31, f1
502 FMADD f2, f17, f30, f2
503 FMADD f3, f17, f31, f3
505 FMADD f4, f18, f30, f4
506 FMADD f5, f18, f31, f5
507 FMADD f6, f19, f30, f6
508 FMADD f7, f19, f31, f7
510 LFD f16, 8 * SIZE(AO1)
511 LFD f17, 9 * SIZE(AO1)
512 LFD f18, 8 * SIZE(AO2)
513 LFD f19, 9 * SIZE(AO2)
515 FMADD f8, f20, f30, f8
516 FMADD f9, f20, f31, f9
517 FMADD f10, f21, f30, f10
518 FMADD f11, f21, f31, f11
520 FMADD f12, f22, f30, f12
521 FMADD f13, f22, f31, f13
522 FMADD f14, f23, f30, f14
523 FMADD f15, f23, f31, f15
525 LFD f20, 8 * SIZE(AO3)
526 LFD f21, 9 * SIZE(AO3)
527 LFD f22, 8 * SIZE(AO4)
528 LFD f23, 9 * SIZE(AO4)
/* f28..f31 are free now: refill with B pairs 7 and 8. */
530 LFD f28, 13 * SIZE(BO)
531 LFD f29, 14 * SIZE(BO)
532 LFD f30, 15 * SIZE(BO)
533 LFD f31, 16 * SIZE(BO)
/* Step 5 of 8: B pair f24/f25. */
535 FMADD f0, f16, f24, f0
536 FMADD f1, f16, f25, f1
537 FMADD f2, f17, f24, f2
538 FMADD f3, f17, f25, f3
540 FMADD f4, f18, f24, f4
541 FMADD f5, f18, f25, f5
542 FMADD f6, f19, f24, f6
543 FMADD f7, f19, f25, f7
545 LFD f16, 10 * SIZE(AO1)
546 LFD f17, 11 * SIZE(AO1)
547 LFD f18, 10 * SIZE(AO2)
548 LFD f19, 11 * SIZE(AO2)
550 FMADD f8, f20, f24, f8
551 FMADD f9, f20, f25, f9
552 FMADD f10, f21, f24, f10
553 FMADD f11, f21, f25, f11
555 FMADD f12, f22, f24, f12
556 FMADD f13, f22, f25, f13
557 FMADD f14, f23, f24, f14
558 FMADD f15, f23, f25, f15
560 LFD f20, 10 * SIZE(AO3)
561 LFD f21, 11 * SIZE(AO3)
562 LFD f22, 10 * SIZE(AO4)
563 LFD f23, 11 * SIZE(AO4)
/* Step 6 of 8: B pair f26/f27. */
565 FMADD f0, f16, f26, f0
566 FMADD f1, f16, f27, f1
567 FMADD f2, f17, f26, f2
568 FMADD f3, f17, f27, f3
570 FMADD f4, f18, f26, f4
571 FMADD f5, f18, f27, f5
572 FMADD f6, f19, f26, f6
573 FMADD f7, f19, f27, f7
575 LFD f16, 12 * SIZE(AO1)
576 LFD f17, 13 * SIZE(AO1)
577 LFD f18, 12 * SIZE(AO2)
578 LFD f19, 13 * SIZE(AO2)
580 FMADD f8, f20, f26, f8
581 FMADD f9, f20, f27, f9
582 FMADD f10, f21, f26, f10
583 FMADD f11, f21, f27, f11
585 FMADD f12, f22, f26, f12
586 FMADD f13, f22, f27, f13
587 FMADD f14, f23, f26, f14
588 FMADD f15, f23, f27, f15
590 LFD f20, 12 * SIZE(AO3)
591 LFD f21, 13 * SIZE(AO3)
592 LFD f22, 12 * SIZE(AO4)
593 LFD f23, 13 * SIZE(AO4)
/* Refill f24..f27 with the first two B pairs of the next pass. */
595 LFD f24, 17 * SIZE(BO)
596 LFD f25, 18 * SIZE(BO)
597 LFD f26, 19 * SIZE(BO)
598 LFD f27, 20 * SIZE(BO)
/* Step 7 of 8: B pair f28/f29. */
600 FMADD f0, f16, f28, f0
601 FMADD f1, f16, f29, f1
602 FMADD f2, f17, f28, f2
603 FMADD f3, f17, f29, f3
605 FMADD f4, f18, f28, f4
606 FMADD f5, f18, f29, f5
607 FMADD f6, f19, f28, f6
608 FMADD f7, f19, f29, f7
610 LFD f16, 14 * SIZE(AO1)
611 LFD f17, 15 * SIZE(AO1)
612 LFD f18, 14 * SIZE(AO2)
613 LFD f19, 15 * SIZE(AO2)
615 FMADD f8, f20, f28, f8
616 FMADD f9, f20, f29, f9
617 FMADD f10, f21, f28, f10
618 FMADD f11, f21, f29, f11
620 FMADD f12, f22, f28, f12
621 FMADD f13, f22, f29, f13
622 FMADD f14, f23, f28, f14
623 FMADD f15, f23, f29, f15
625 LFD f20, 14 * SIZE(AO3)
626 LFD f21, 15 * SIZE(AO3)
627 LFD f22, 14 * SIZE(AO4)
628 LFD f23, 15 * SIZE(AO4)
/* Step 8 of 8: B pair f30/f31. */
630 FMADD f0, f16, f30, f0
631 FMADD f1, f16, f31, f1
632 FMADD f2, f17, f30, f2
633 FMADD f3, f17, f31, f3
635 FMADD f4, f18, f30, f4
636 FMADD f5, f18, f31, f5
637 FMADD f6, f19, f30, f6
638 FMADD f7, f19, f31, f7
/* Load the first pair of the next pass, then advance AO1 / AO2 so that
   these values sit at offsets 0 and 1 of the new pointers. */
640 LFD f16, 16 * SIZE(AO1)
641 LFD f17, 17 * SIZE(AO1)
642 LFD f18, 16 * SIZE(AO2)
643 LFD f19, 17 * SIZE(AO2)
645 addi AO1, AO1, 16 * SIZE
646 addi AO2, AO2, 16 * SIZE
650 FMADD f8, f20, f30, f8
651 FMADD f9, f20, f31, f9
652 FMADD f10, f21, f30, f10
653 FMADD f11, f21, f31, f11
655 FMADD f12, f22, f30, f12
656 FMADD f13, f22, f31, f13
657 FMADD f14, f23, f30, f14
658 FMADD f15, f23, f31, f15
/* Same hand-over for AO3 / AO4 and for the B pairs in f28..f31. */
660 LFD f20, 16 * SIZE(AO3)
661 LFD f21, 17 * SIZE(AO3)
662 LFD f22, 16 * SIZE(AO4)
663 LFD f23, 17 * SIZE(AO4)
665 LFD f28, 21 * SIZE(BO)
666 LFD f29, 22 * SIZE(BO)
667 LFD f30, 23 * SIZE(BO)
668 LFD f31, 24 * SIZE(BO)
670 addi AO3, AO3, 16 * SIZE
671 addi AO4, AO4, 16 * SIZE
/* Advance BO; registers now match the preload state for the next pass. */
675 addi BO, BO, 16 * SIZE
/* Drain pass of the four-column kernel -- presumably the code at
   LL(MainKernelSkip); the label itself is not visible in this excerpt.
   It performs the same eight steps as the loop body on the values already
   held in f16..f31, but does not load operands for a further pass.
   Sums: f0-f3 (AO1), f4-f7 (AO2), f8-f11 (AO3), f12-f15 (AO4).
   On exit AO1..AO4 and BO have each advanced by 16 * SIZE. */
/* Step 1 of 8: B pair f24/f25. */
680 FMADD f0, f16, f24, f0
681 FMADD f1, f16, f25, f1
682 FMADD f2, f17, f24, f2
683 FMADD f3, f17, f25, f3
685 FMADD f4, f18, f24, f4
686 FMADD f5, f18, f25, f5
687 FMADD f6, f19, f24, f6
688 FMADD f7, f19, f25, f7
690 LFD f16, 2 * SIZE(AO1)
691 LFD f17, 3 * SIZE(AO1)
692 LFD f18, 2 * SIZE(AO2)
693 LFD f19, 3 * SIZE(AO2)
695 FMADD f8, f20, f24, f8
696 FMADD f9, f20, f25, f9
697 FMADD f10, f21, f24, f10
698 FMADD f11, f21, f25, f11
700 FMADD f12, f22, f24, f12
701 FMADD f13, f22, f25, f13
702 FMADD f14, f23, f24, f14
703 FMADD f15, f23, f25, f15
705 LFD f20, 2 * SIZE(AO3)
706 LFD f21, 3 * SIZE(AO3)
707 LFD f22, 2 * SIZE(AO4)
708 LFD f23, 3 * SIZE(AO4)
/* Step 2 of 8: B pair f26/f27. */
710 FMADD f0, f16, f26, f0
711 FMADD f1, f16, f27, f1
712 FMADD f2, f17, f26, f2
713 FMADD f3, f17, f27, f3
715 FMADD f4, f18, f26, f4
716 FMADD f5, f18, f27, f5
717 FMADD f6, f19, f26, f6
718 FMADD f7, f19, f27, f7
720 LFD f16, 4 * SIZE(AO1)
721 LFD f17, 5 * SIZE(AO1)
722 LFD f18, 4 * SIZE(AO2)
723 LFD f19, 5 * SIZE(AO2)
725 FMADD f8, f20, f26, f8
726 FMADD f9, f20, f27, f9
727 FMADD f10, f21, f26, f10
728 FMADD f11, f21, f27, f11
730 FMADD f12, f22, f26, f12
731 FMADD f13, f22, f27, f13
732 FMADD f14, f23, f26, f14
733 FMADD f15, f23, f27, f15
735 LFD f20, 4 * SIZE(AO3)
736 LFD f21, 5 * SIZE(AO3)
737 LFD f22, 4 * SIZE(AO4)
738 LFD f23, 5 * SIZE(AO4)
/* Step 3 of 8: B pair f28/f29. */
740 FMADD f0, f16, f28, f0
741 FMADD f1, f16, f29, f1
742 FMADD f2, f17, f28, f2
743 FMADD f3, f17, f29, f3
745 FMADD f4, f18, f28, f4
746 FMADD f5, f18, f29, f5
747 FMADD f6, f19, f28, f6
748 FMADD f7, f19, f29, f7
750 LFD f16, 6 * SIZE(AO1)
751 LFD f17, 7 * SIZE(AO1)
752 LFD f18, 6 * SIZE(AO2)
753 LFD f19, 7 * SIZE(AO2)
755 FMADD f8, f20, f28, f8
756 FMADD f9, f20, f29, f9
757 FMADD f10, f21, f28, f10
758 FMADD f11, f21, f29, f11
760 FMADD f12, f22, f28, f12
761 FMADD f13, f22, f29, f13
762 FMADD f14, f23, f28, f14
763 FMADD f15, f23, f29, f15
765 LFD f20, 6 * SIZE(AO3)
766 LFD f21, 7 * SIZE(AO3)
767 LFD f22, 6 * SIZE(AO4)
768 LFD f23, 7 * SIZE(AO4)
/* Step 4 of 8: B pair f30/f31. */
770 FMADD f0, f16, f30, f0
771 FMADD f1, f16, f31, f1
772 FMADD f2, f17, f30, f2
773 FMADD f3, f17, f31, f3
775 FMADD f4, f18, f30, f4
776 FMADD f5, f18, f31, f5
777 FMADD f6, f19, f30, f6
778 FMADD f7, f19, f31, f7
780 LFD f16, 8 * SIZE(AO1)
781 LFD f17, 9 * SIZE(AO1)
782 LFD f18, 8 * SIZE(AO2)
783 LFD f19, 9 * SIZE(AO2)
785 FMADD f8, f20, f30, f8
786 FMADD f9, f20, f31, f9
787 FMADD f10, f21, f30, f10
788 FMADD f11, f21, f31, f11
790 FMADD f12, f22, f30, f12
791 FMADD f13, f22, f31, f13
792 FMADD f14, f23, f30, f14
793 FMADD f15, f23, f31, f15
795 LFD f20, 8 * SIZE(AO3)
796 LFD f21, 9 * SIZE(AO3)
797 LFD f22, 8 * SIZE(AO4)
798 LFD f23, 9 * SIZE(AO4)
/* Load the last four B pairs of this pass.  The final load is the update
   form (assuming LFDU expands to a load-with-update instruction), so it
   also advances BO by 16 * SIZE. */
800 LFD f24, 9 * SIZE(BO)
801 LFD f25, 10 * SIZE(BO)
802 LFD f26, 11 * SIZE(BO)
803 LFD f27, 12 * SIZE(BO)
805 LFD f28, 13 * SIZE(BO)
806 LFD f29, 14 * SIZE(BO)
807 LFD f30, 15 * SIZE(BO)
808 LFDU f31, 16 * SIZE(BO)
/* Step 5 of 8: B pair f24/f25. */
810 FMADD f0, f16, f24, f0
811 FMADD f1, f16, f25, f1
812 FMADD f2, f17, f24, f2
813 FMADD f3, f17, f25, f3
815 FMADD f4, f18, f24, f4
816 FMADD f5, f18, f25, f5
817 FMADD f6, f19, f24, f6
818 FMADD f7, f19, f25, f7
820 LFD f16, 10 * SIZE(AO1)
821 LFD f17, 11 * SIZE(AO1)
822 LFD f18, 10 * SIZE(AO2)
823 LFD f19, 11 * SIZE(AO2)
825 FMADD f8, f20, f24, f8
826 FMADD f9, f20, f25, f9
827 FMADD f10, f21, f24, f10
828 FMADD f11, f21, f25, f11
830 FMADD f12, f22, f24, f12
831 FMADD f13, f22, f25, f13
832 FMADD f14, f23, f24, f14
833 FMADD f15, f23, f25, f15
835 LFD f20, 10 * SIZE(AO3)
836 LFD f21, 11 * SIZE(AO3)
837 LFD f22, 10 * SIZE(AO4)
838 LFD f23, 11 * SIZE(AO4)
/* Step 6 of 8: B pair f26/f27. */
840 FMADD f0, f16, f26, f0
841 FMADD f1, f16, f27, f1
842 FMADD f2, f17, f26, f2
843 FMADD f3, f17, f27, f3
845 FMADD f4, f18, f26, f4
846 FMADD f5, f18, f27, f5
847 FMADD f6, f19, f26, f6
848 FMADD f7, f19, f27, f7
850 LFD f16, 12 * SIZE(AO1)
851 LFD f17, 13 * SIZE(AO1)
852 LFD f18, 12 * SIZE(AO2)
853 LFD f19, 13 * SIZE(AO2)
855 FMADD f8, f20, f26, f8
856 FMADD f9, f20, f27, f9
857 FMADD f10, f21, f26, f10
858 FMADD f11, f21, f27, f11
860 FMADD f12, f22, f26, f12
861 FMADD f13, f22, f27, f13
862 FMADD f14, f23, f26, f14
863 FMADD f15, f23, f27, f15
865 LFD f20, 12 * SIZE(AO3)
866 LFD f21, 13 * SIZE(AO3)
867 LFD f22, 12 * SIZE(AO4)
868 LFD f23, 13 * SIZE(AO4)
/* Step 7 of 8: B pair f28/f29. */
870 FMADD f0, f16, f28, f0
871 FMADD f1, f16, f29, f1
872 FMADD f2, f17, f28, f2
873 FMADD f3, f17, f29, f3
875 FMADD f4, f18, f28, f4
876 FMADD f5, f18, f29, f5
877 FMADD f6, f19, f28, f6
878 FMADD f7, f19, f29, f7
880 LFD f16, 14 * SIZE(AO1)
881 LFD f17, 15 * SIZE(AO1)
882 LFD f18, 14 * SIZE(AO2)
883 LFD f19, 15 * SIZE(AO2)
885 FMADD f8, f20, f28, f8
886 FMADD f9, f20, f29, f9
887 FMADD f10, f21, f28, f10
888 FMADD f11, f21, f29, f11
890 FMADD f12, f22, f28, f12
891 FMADD f13, f22, f29, f13
892 FMADD f14, f23, f28, f14
893 FMADD f15, f23, f29, f15
895 LFD f20, 14 * SIZE(AO3)
896 LFD f21, 15 * SIZE(AO3)
897 LFD f22, 14 * SIZE(AO4)
898 LFD f23, 15 * SIZE(AO4)
/* All A values for this pass are loaded: advance the four A pointers. */
900 addi AO1, AO1, 16 * SIZE
901 addi AO2, AO2, 16 * SIZE
902 addi AO3, AO3, 16 * SIZE
903 addi AO4, AO4, 16 * SIZE
/* Step 8 of 8: B pair f30/f31; nothing is reloaded afterwards. */
905 FMADD f0, f16, f30, f0
906 FMADD f1, f16, f31, f1
907 FMADD f2, f17, f30, f2
908 FMADD f3, f17, f31, f3
910 FMADD f4, f18, f30, f4
911 FMADD f5, f18, f31, f5
912 FMADD f6, f19, f30, f6
913 FMADD f7, f19, f31, f7
915 FMADD f8, f20, f30, f8
916 FMADD f9, f20, f31, f9
917 FMADD f10, f21, f30, f10
918 FMADD f11, f21, f31, f11
920 FMADD f12, f22, f30, f12
921 FMADD f13, f22, f31, f13
922 FMADD f14, f23, f30, f14
923 FMADD f15, f23, f31, f15
/* Four-column loop that handles one pair per iteration, accumulating into
   the same f0..f15 sums as the unrolled kernel.
   NOTE(review): the CTR setup, the test that decides whether this loop
   runs, and the LL(MainN3Kernel) label itself are not visible in this
   excerpt -- presumably this covers the elements left over after the
   8-pair passes. */
/* Preload one pair from each A stream and one pair from BO.  LFDU is the
   update form (assuming it expands to a load-with-update instruction), so
   BO advances by 2 * SIZE. */
932 LFD f16, 0 * SIZE(AO1)
933 LFD f17, 1 * SIZE(AO1)
934 LFD f18, 0 * SIZE(AO2)
935 LFD f19, 1 * SIZE(AO2)
936 LFD f20, 0 * SIZE(AO3)
937 LFD f21, 1 * SIZE(AO3)
938 LFD f22, 0 * SIZE(AO4)
939 LFD f23, 1 * SIZE(AO4)
941 LFD f24, 1 * SIZE(BO)
942 LFDU f25, 2 * SIZE(BO)
944 addi AO1, AO1, 2 * SIZE
945 addi AO2, AO2, 2 * SIZE
946 addi AO3, AO3, 2 * SIZE
947 addi AO4, AO4, 2 * SIZE
/* Decrement CTR; when it reaches zero only the final multiply-adds remain. */
949 bdz LL(MainN3KernelSkip)
/* Loop body: consume the pair in flight for the AO1 / AO2 streams ... */
953 FMADD f0, f16, f24, f0
954 FMADD f1, f16, f25, f1
955 FMADD f2, f17, f24, f2
956 FMADD f3, f17, f25, f3
958 FMADD f4, f18, f24, f4
959 FMADD f5, f18, f25, f5
960 FMADD f6, f19, f24, f6
961 FMADD f7, f19, f25, f7
963 LFD f16, 0 * SIZE(AO1)
964 LFD f17, 1 * SIZE(AO1)
965 LFD f18, 0 * SIZE(AO2)
966 LFD f19, 1 * SIZE(AO2)
/* ... and for the AO3 / AO4 streams, then fetch the next pair. */
968 FMADD f8, f20, f24, f8
969 FMADD f9, f20, f25, f9
970 FMADD f10, f21, f24, f10
971 FMADD f11, f21, f25, f11
973 FMADD f12, f22, f24, f12
974 FMADD f13, f22, f25, f13
975 FMADD f14, f23, f24, f14
976 FMADD f15, f23, f25, f15
978 LFD f20, 0 * SIZE(AO3)
979 LFD f21, 1 * SIZE(AO3)
980 LFD f22, 0 * SIZE(AO4)
981 LFD f23, 1 * SIZE(AO4)
983 LFD f24, 1 * SIZE(BO)
984 LFDU f25, 2 * SIZE(BO)
986 addi AO1, AO1, 2 * SIZE
987 addi AO2, AO2, 2 * SIZE
988 addi AO3, AO3, 2 * SIZE
989 addi AO4, AO4, 2 * SIZE
/* Decrement CTR and repeat while it is non-zero. */
991 bdnz LL(MainN3Kernel)
/* Final multiply-adds for the last pair that was loaded. */
994 LL(MainN3KernelSkip):
995 FMADD f0, f16, f24, f0
996 FMADD f1, f16, f25, f1
997 FMADD f2, f17, f24, f2
998 FMADD f3, f17, f25, f3
1000 FMADD f4, f18, f24, f4
1001 FMADD f5, f18, f25, f5
1002 FMADD f6, f19, f24, f6
1003 FMADD f7, f19, f25, f7
1005 FMADD f8, f20, f24, f8
1006 FMADD f9, f20, f25, f9
1007 FMADD f10, f21, f24, f10
1008 FMADD f11, f21, f25, f11
1010 FMADD f12, f22, f24, f12
1011 FMADD f13, f22, f25, f13
1012 FMADD f14, f23, f24, f14
1013 FMADD f15, f23, f25, f15
/* Fold the four results into memory, two values per result.
   Each pair (f0,f1), (f4,f5), (f8,f9), (f12,f13) is combined with the
   scalars in f30 / f31 and added to a pair read through CO.  FMADDR and
   FMSUBR are the CONJ / XCONJ dependent macros defined at the top of the
   file.
   NOTE(review): the code that reduces f0..f15 to these four pairs, the
   loads of f30 / f31 (presumably from ALPHA_R / ALPHA_I), the branch that
   consumes the comparison below, and the pointer updates between the pairs
   in the second variant are not visible in this excerpt. */
/* Compare the scaled INCY with 2 * SIZE -- presumably selects between the
   two variants below. */
1063 cmpwi cr0, INCY, 2 * SIZE
/* Variant 1: all eight values are read from consecutive offsets of CO. */
1066 LFD f16, 0 * SIZE(CO)
1067 LFD f17, 1 * SIZE(CO)
1068 LFD f18, 2 * SIZE(CO)
1069 LFD f19, 3 * SIZE(CO)
1070 LFD f20, 4 * SIZE(CO)
1071 LFD f21, 5 * SIZE(CO)
1072 LFD f22, 6 * SIZE(CO)
1073 LFD f23, 7 * SIZE(CO)
/* Contribution of f30 to both values of each pair. */
1075 FMADD f16, f30, f0, f16
1076 FMADDR f17, f30, f1, f17
1077 FMADD f18, f30, f4, f18
1078 FMADDR f19, f30, f5, f19
1080 FMADD f20, f30, f8, f20
1081 FMADDR f21, f30, f9, f21
1082 FMADD f22, f30, f12, f22
1083 FMADDR f23, f30, f13, f23
/* Contribution of f31: it multiplies the other member of each pair. */
1085 FMSUBR f16, f31, f1, f16
1086 FMADD f17, f31, f0, f17
1087 FMSUBR f18, f31, f5, f18
1088 FMADD f19, f31, f4, f19
1090 FMSUBR f20, f31, f9, f20
1091 FMADD f21, f31, f8, f21
1092 FMSUBR f22, f31, f13, f22
1093 FMADD f23, f31, f12, f23
/* Write the eight updated values back and step CO past them. */
1095 STFD f16, 0 * SIZE(CO)
1096 STFD f17, 1 * SIZE(CO)
1097 STFD f18, 2 * SIZE(CO)
1098 STFD f19, 3 * SIZE(CO)
1100 STFD f20, 4 * SIZE(CO)
1101 STFD f21, 5 * SIZE(CO)
1102 STFD f22, 6 * SIZE(CO)
1103 STFD f23, 7 * SIZE(CO)
1105 addi CO, CO, 8 * SIZE
/* Variant 2: each pair is read at offsets 0 and 1 of CO; CO is presumably
   advanced between the pairs by instructions not shown here. */
1114 LFD f16, 0 * SIZE(CO)
1115 LFD f17, 1 * SIZE(CO)
1118 LFD f18, 0 * SIZE(CO)
1119 LFD f19, 1 * SIZE(CO)
1122 LFD f20, 0 * SIZE(CO)
1123 LFD f21, 1 * SIZE(CO)
1126 LFD f22, 0 * SIZE(CO)
1127 LFD f23, 1 * SIZE(CO)
/* Same arithmetic as in variant 1. */
1130 FMADD f16, f30, f0, f16
1131 FMADDR f17, f30, f1, f17
1132 FMADD f18, f30, f4, f18
1133 FMADDR f19, f30, f5, f19
1135 FMADD f20, f30, f8, f20
1136 FMADDR f21, f30, f9, f21
1137 FMADD f22, f30, f12, f22
1138 FMADDR f23, f30, f13, f23
1140 FMSUBR f16, f31, f1, f16
1141 FMADD f17, f31, f0, f17
1142 FMSUBR f18, f31, f5, f18
1143 FMADD f19, f31, f4, f19
1145 FMSUBR f20, f31, f9, f20
1146 FMADD f21, f31, f8, f21
1147 FMSUBR f22, f31, f13, f22
1148 FMADD f23, f31, f12, f23
/* Results are written through BO at offsets 0 and 1; BO is presumably
   advanced between the pairs by instructions not shown here. */
1150 STFD f16, 0 * SIZE(BO)
1151 STFD f17, 1 * SIZE(BO)
1153 STFD f18, 0 * SIZE(BO)
1154 STFD f19, 1 * SIZE(BO)
1157 STFD f20, 0 * SIZE(BO)
1158 STFD f21, 1 * SIZE(BO)
1160 STFD f22, 0 * SIZE(BO)
1161 STFD f23, 1 * SIZE(BO)
/* Single-column kernel: only the AO1 stream is read.
   Register roles, as established by the instructions below:
     f16..f23 : four consecutive pairs read through AO1
     f24..f31 : four consecutive pairs read through BO
     f0-f3, f4-f7, f8-f11, f12-f15 : four independent groups of sums, one
                                     per pair position
   Each group receives sum += a * b for the four combinations of
   {a0, a1} x {b0, b1} (assuming FMADD d, a, c, b expands to the PowerPC
   fused multiply-add d = a * c + b).
   One loop pass covers eight pairs and advances AO1 and BO by 16 * SIZE.
   NOTE(review): the CTR setup, the branch that uses the result of srawi.,
   the zeroing of f0..f15 and the LL(RemainKernel) label are not visible in
   this excerpt. */
/* r0 = MIN_N >> 3 (arithmetic shift); the record form also sets cr0. */
1195 srawi. r0 , MIN_N, 3
/* Preload four pairs from AO1 and four pairs from BO. */
1199 LFD f16, 0 * SIZE(AO1)
1200 LFD f17, 1 * SIZE(AO1)
1201 LFD f18, 2 * SIZE(AO1)
1202 LFD f19, 3 * SIZE(AO1)
1204 LFD f20, 4 * SIZE(AO1)
1205 LFD f21, 5 * SIZE(AO1)
1206 LFD f22, 6 * SIZE(AO1)
1207 LFD f23, 7 * SIZE(AO1)
1209 LFD f24, 1 * SIZE(BO)
1210 LFD f25, 2 * SIZE(BO)
1211 LFD f26, 3 * SIZE(BO)
1212 LFD f27, 4 * SIZE(BO)
1214 LFD f28, 5 * SIZE(BO)
1215 LFD f29, 6 * SIZE(BO)
1216 LFD f30, 7 * SIZE(BO)
1217 LFD f31, 8 * SIZE(BO)
/* Decrement CTR; if it reaches zero only the drain pass is executed. */
1219 bdz LL(RemainKernelSkip)
/* Loop body, first half: pairs 1 and 2, then reload their registers. */
1223 FMADD f0, f16, f24, f0
1224 FMADD f1, f16, f25, f1
1225 FMADD f2, f17, f24, f2
1226 FMADD f3, f17, f25, f3
1228 FMADD f4, f18, f26, f4
1229 FMADD f5, f18, f27, f5
1230 FMADD f6, f19, f26, f6
1231 FMADD f7, f19, f27, f7
1233 LFD f16, 8 * SIZE(AO1)
1234 LFD f17, 9 * SIZE(AO1)
1235 LFD f18, 10 * SIZE(AO1)
1236 LFD f19, 11 * SIZE(AO1)
1238 LFD f24, 9 * SIZE(BO)
1239 LFD f25, 10 * SIZE(BO)
1240 LFD f26, 11 * SIZE(BO)
1241 LFD f27, 12 * SIZE(BO)
/* Pairs 3 and 4, then reload their registers. */
1243 FMADD f8, f20, f28, f8
1244 FMADD f9, f20, f29, f9
1245 FMADD f10, f21, f28, f10
1246 FMADD f11, f21, f29, f11
1248 FMADD f12, f22, f30, f12
1249 FMADD f13, f22, f31, f13
1250 FMADD f14, f23, f30, f14
1251 FMADD f15, f23, f31, f15
1253 LFD f20, 12 * SIZE(AO1)
1254 LFD f21, 13 * SIZE(AO1)
1255 LFD f22, 14 * SIZE(AO1)
1256 LFD f23, 15 * SIZE(AO1)
1258 LFD f28, 13 * SIZE(BO)
1259 LFD f29, 14 * SIZE(BO)
1260 LFD f30, 15 * SIZE(BO)
1261 LFD f31, 16 * SIZE(BO)
/* Second half: pairs 5 and 6; the reloads fetch data for the next pass. */
1263 FMADD f0, f16, f24, f0
1264 FMADD f1, f16, f25, f1
1265 FMADD f2, f17, f24, f2
1266 FMADD f3, f17, f25, f3
1268 FMADD f4, f18, f26, f4
1269 FMADD f5, f18, f27, f5
1270 FMADD f6, f19, f26, f6
1271 FMADD f7, f19, f27, f7
1273 LFD f16, 16 * SIZE(AO1)
1274 LFD f17, 17 * SIZE(AO1)
1275 LFD f18, 18 * SIZE(AO1)
1276 LFD f19, 19 * SIZE(AO1)
1278 LFD f24, 17 * SIZE(BO)
1279 LFD f25, 18 * SIZE(BO)
1280 LFD f26, 19 * SIZE(BO)
1281 LFD f27, 20 * SIZE(BO)
/* Pairs 7 and 8; the reloads fetch data for the next pass. */
1283 FMADD f8, f20, f28, f8
1284 FMADD f9, f20, f29, f9
1285 FMADD f10, f21, f28, f10
1286 FMADD f11, f21, f29, f11
1288 FMADD f12, f22, f30, f12
1289 FMADD f13, f22, f31, f13
1290 FMADD f14, f23, f30, f14
1291 FMADD f15, f23, f31, f15
1293 LFD f20, 20 * SIZE(AO1)
1294 LFD f21, 21 * SIZE(AO1)
1295 LFD f22, 22 * SIZE(AO1)
1296 LFD f23, 23 * SIZE(AO1)
1298 LFD f28, 21 * SIZE(BO)
1299 LFD f29, 22 * SIZE(BO)
1300 LFD f30, 23 * SIZE(BO)
1301 LFD f31, 24 * SIZE(BO)
/* Advance both pointers; registers now match the preload state. */
1303 addi AO1, AO1, 16 * SIZE
1304 addi BO, BO, 16 * SIZE
/* Decrement CTR and repeat while it is non-zero. */
1308 bdnz LL(RemainKernel)
/* Drain pass: same arithmetic on the values already loaded, with no
   loads for a further pass. */
1311 LL(RemainKernelSkip):
1312 FMADD f0, f16, f24, f0
1313 FMADD f1, f16, f25, f1
1314 FMADD f2, f17, f24, f2
1315 FMADD f3, f17, f25, f3
1317 FMADD f4, f18, f26, f4
1318 FMADD f5, f18, f27, f5
1319 FMADD f6, f19, f26, f6
1320 FMADD f7, f19, f27, f7
1322 LFD f16, 8 * SIZE(AO1)
1323 LFD f17, 9 * SIZE(AO1)
1324 LFD f18, 10 * SIZE(AO1)
1325 LFD f19, 11 * SIZE(AO1)
1327 LFD f24, 9 * SIZE(BO)
1328 LFD f25, 10 * SIZE(BO)
1329 LFD f26, 11 * SIZE(BO)
1330 LFD f27, 12 * SIZE(BO)
1332 FMADD f8, f20, f28, f8
1333 FMADD f9, f20, f29, f9
1334 FMADD f10, f21, f28, f10
1335 FMADD f11, f21, f29, f11
1337 FMADD f12, f22, f30, f12
1338 FMADD f13, f22, f31, f13
1339 FMADD f14, f23, f30, f14
1340 FMADD f15, f23, f31, f15
1342 LFD f20, 12 * SIZE(AO1)
1343 LFD f21, 13 * SIZE(AO1)
1344 LFD f22, 14 * SIZE(AO1)
1345 LFD f23, 15 * SIZE(AO1)
/* The last load is the update form (assuming LFDU expands to a
   load-with-update instruction): BO advances by 16 * SIZE. */
1347 LFD f28, 13 * SIZE(BO)
1348 LFD f29, 14 * SIZE(BO)
1349 LFD f30, 15 * SIZE(BO)
1350 LFDU f31, 16 * SIZE(BO)
1352 FMADD f0, f16, f24, f0
1353 FMADD f1, f16, f25, f1
1354 FMADD f2, f17, f24, f2
1355 FMADD f3, f17, f25, f3
1357 FMADD f4, f18, f26, f4
1358 FMADD f5, f18, f27, f5
1359 FMADD f6, f19, f26, f6
1360 FMADD f7, f19, f27, f7
1362 FMADD f8, f20, f28, f8
1363 FMADD f9, f20, f29, f9
1364 FMADD f10, f21, f28, f10
1365 FMADD f11, f21, f29, f11
1367 FMADD f12, f22, f30, f12
1368 FMADD f13, f22, f31, f13
1369 FMADD f14, f23, f30, f14
1370 FMADD f15, f23, f31, f15
/* Step AO1 past the 16 values consumed by the drain pass. */
1372 addi AO1, AO1, 16 * SIZE
/* Single-column loop that handles one pair per iteration, accumulating
   into f0..f3 only.
   NOTE(review): the comparison that sets the condition for the branch
   below, the CTR setup and the LL(RemainN3Kernel) label are not visible
   in this excerpt. */
/* Skip this loop when the preceding comparison yielded "less or equal". */
1378 ble LL(RemainFinish)
/* Preload one pair from AO1 and one from BO.  LFDU is the update form
   (assuming it expands to a load-with-update instruction), so BO advances
   by 2 * SIZE. */
1381 LFD f16, 0 * SIZE(AO1)
1382 LFD f17, 1 * SIZE(AO1)
1383 LFD f24, 1 * SIZE(BO)
1384 LFDU f25, 2 * SIZE(BO)
1385 addi AO1, AO1, 2 * SIZE
/* Decrement CTR; when it reaches zero only the final multiply-adds remain. */
1386 bdz LL(RemainN3KernelSkip)
/* Loop body: consume the pair in flight, then fetch the next one. */
1390 FMADD f0, f16, f24, f0
1391 FMADD f1, f16, f25, f1
1392 FMADD f2, f17, f24, f2
1393 FMADD f3, f17, f25, f3
1395 LFD f16, 0 * SIZE(AO1)
1396 LFD f17, 1 * SIZE(AO1)
1397 LFD f24, 1 * SIZE(BO)
1398 LFDU f25, 2 * SIZE(BO)
1399 addi AO1, AO1, 2 * SIZE
/* Decrement CTR and repeat while it is non-zero. */
1400 bdnz LL(RemainN3Kernel)
/* Final multiply-adds for the last pair that was loaded. */
1403 LL(RemainN3KernelSkip):
1404 FMADD f0, f16, f24, f0
1405 FMADD f1, f16, f25, f1
1406 FMADD f2, f17, f24, f2
1407 FMADD f3, f17, f25, f3
/* Update a single pair at CO with the pair (f0, f1) and the scalars in
   f30 / f31, then release the stack frame.
   NOTE(review): the LL(RemainFinish) label, the reduction of the partial
   sums into f0 / f1, the loads of f30 / f31 (presumably from ALPHA_R /
   ALPHA_I), the register restores and the return instruction are not
   visible in this excerpt. */
/* Read the current pair. */
1413 LFD f16, 0 * SIZE(CO)
1414 LFD f17, 1 * SIZE(CO)
/* Contribution of f30, then of f31 (which multiplies the other member of
   the pair).  FMADDR / FMSUBR are the CONJ / XCONJ dependent macros defined
   at the top of the file. */
1449 FMADD f16, f30, f0, f16
1450 FMADDR f17, f30, f1, f17
1451 FMSUBR f16, f31, f1, f16
1452 FMADD f17, f31, f0, f17
/* Write the updated pair back. */
1454 STFD f16, 0 * SIZE(CO)
1455 STFD f17, 1 * SIZE(CO)
/* Undo the STACKSIZE adjustment made on entry. */
1521 addi SP, SP, STACKSIZE