1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
/* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Build-time configuration.  Several of the #else / #endif / #ifdef    */
/* lines that pair with the conditionals below are not part of this     */
/* view, so the guard selecting each alternative cannot always be       */
/* confirmed from here.                                                 */
72 #if defined(_AIX) || defined(__APPLE__)
73 #if !defined(__64BIT__) && defined(DOUBLE)
109 #define Y1 r23 /* dummy; should be same as gemv_n.S */
110 #define Y2 r24 /* dummy; should be same as gemv_n.S */
/* Prefetch distances.  Each value is multiplied by SIZE and loaded     */
/* into PREA / PREC later in the file.  The (A, C) pairs below are      */
/* presumably alternatives chosen by per-CPU guards; only the           */
/* PPC440 / PPC440FP2 guard is visible in this view -- TODO confirm     */
/* the others against the full file.                                    */
113 #define PREFETCHSIZE_A 34
114 #define PREFETCHSIZE_C 16
117 #if defined(PPC440) || defined(PPC440FP2)
118 #define PREFETCHSIZE_A 34
119 #define PREFETCHSIZE_C 16
123 #define PREFETCHSIZE_A 56
124 #define PREFETCHSIZE_C 16
128 #define PREFETCHSIZE_A 56
129 #define PREFETCHSIZE_C 16
133 #define PREFETCHSIZE_A 34
134 #define PREFETCHSIZE_C 16
138 #define PREFETCHSIZE_A 40
139 #define PREFETCHSIZE_C 8
143 #define PREFETCHSIZE_A 24
144 #define PREFETCHSIZE_C 8
/* FMADDR / FMSUBR are the multiply-add forms used for the result       */
/* update near the end of the file; their definition depends on CONJ    */
/* and XCONJ.  Original lines between the two defines below are not     */
/* in this view (likely an #else separating two variants) -- TODO       */
/* confirm.                                                             */
147 #if !(defined(CONJ) && defined(XCONJ))
149 #define FMSUBR FNMSUB
151 #define FMADDR FNMSUB
/* SP-relative scratch slots.  Two layouts are given (offsets           */
/* 200/208/216 and 256/264/272); the condition choosing between them    */
/* is not visible here.                                                 */
158 #define FZERO 200(SP)
159 #define ALPHA_R 208(SP)
160 #define ALPHA_I 216(SP)
162 #define FZERO 256(SP)
163 #define ALPHA_R 264(SP)
164 #define ALPHA_I 272(SP)
/* Reserve STACKSIZE bytes of stack frame by lowering SP.               */
170 addi SP, SP, -STACKSIZE
/* Fetch arguments from memory.  Every slot is addressed as             */
/* FRAMESLOT(n) + STACKSIZE(SP), i.e. relative to the value SP had      */
/* before the adjustment above (presumably the caller's parameter       */
/* area; FRAMESLOT is defined outside this view).  lwz loads a 32-bit   */
/* word, ld loads a 64-bit doubleword.  The groups below are            */
/* alternatives for different ABIs / word sizes; most of the            */
/* preprocessor guards that select one group are not in this view.      */
225 lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
226 lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
228 ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
229 ld Y, FRAMESLOT(1) + STACKSIZE(SP)
230 ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
231 ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
235 #if defined(_AIX) || defined(__APPLE__)
/* Variant that takes six values (LDA, X, INCX, Y, INCY, BUFFER)        */
/* from slots 0..5.                                                     */
238 lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
239 lwz X, FRAMESLOT(1) + STACKSIZE(SP)
240 lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
241 lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
242 lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
243 lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
/* Variants that take four values (INCX, Y, INCY, BUFFER) from          */
/* slots 0..3, as 32-bit and as 64-bit loads.                           */
245 lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
246 lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
247 lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
248 lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
251 ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
252 ld Y, FRAMESLOT(1) + STACKSIZE(SP)
253 ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
254 ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
/* Stride / offset setup.  Branches, labels and the loads that feed     */
/* the stores below are not part of this view.                          */
/* PLDA_M = (PLDA_M - XP) << ZBASE_SHIFT  (subf d, a, b gives b - a).   */
263 subf PLDA_M, XP, PLDA_M
264 slwi PLDA_M, PLDA_M, ZBASE_SHIFT
/* Shift LDA, INCX and INCY left by ZBASE_SHIFT (presumably turning     */
/* element counts into byte strides -- confirm against ZBASE_SHIFT).    */
266 slwi LDA, LDA, ZBASE_SHIFT
267 slwi INCX, INCX, ZBASE_SHIFT
268 slwi INCY, INCY, ZBASE_SHIFT
/* Prefetch distances converted to byte offsets.                        */
272 li PREA, PREFETCHSIZE_A * SIZE
273 li PREC, PREFETCHSIZE_C * SIZE
/* r0 = IS << ZBASE_SHIFT; then compare MIN_N with P into cr0 (cmpi     */
/* takes an immediate, so P is presumably a constant).                  */
283 slwi r0, IS, ZBASE_SHIFT
284 cmpi cr0, 0, MIN_N, P
/* Compare INCX with 2 * SIZE; the branch consuming cr0 is not visible  */
/* (presumably it picks between a contiguous and a strided copy).       */
289 cmpwi cr0, INCX, 2 * SIZE
/* CO = BUFFER - SIZE, so the stores below at offsets 1..8 * SIZE land  */
/* on BUFFER[0..7].                                                     */
293 addi CO, BUFFER, -SIZE
/* Store eight values to the buffer; the update-form STFDU on the last  */
/* one also advances CO by 8 * SIZE.  The loads that fill f0..f7 are    */
/* not in this view.                                                    */
314 STFD f0, 1 * SIZE(CO)
315 STFD f1, 2 * SIZE(CO)
316 STFD f2, 3 * SIZE(CO)
317 STFD f3, 4 * SIZE(CO)
318 STFD f4, 5 * SIZE(CO)
319 STFD f5, 6 * SIZE(CO)
320 STFD f6, 7 * SIZE(CO)
321 STFDU f7, 8 * SIZE(CO)
/* Two-value variant of the same store; CO advances by 2 * SIZE.        */
335 STFD f0, 1 * SIZE(CO)
336 STFDU f1, 2 * SIZE(CO)
/* Main kernel: four A pointers (AO1..AO4) processed together.          */
/* Register roles as used below:                                        */
/*   f16,f17 / f18,f19 / f20,f21 / f22,f23 : current pair read from     */
/*       AO1 / AO2 / AO3 / AO4                                          */
/*   f24..f31 : eight consecutive values read from BO (four pairs)      */
/*   f0-f3, f4-f7, f8-f11, f12-f15 : accumulators for AO1..AO4; each    */
/*       group sums the four products a0*b0, a0*b1, a1*b0, a1*b1 of an  */
/*       A pair (a0,a1) with a BO pair (b0,b1)                          */
/* FMADD d, a, c, b is assumed to compute d = a*c + b like the PowerPC  */
/* fmadd instruction; the macro is defined outside this view.           */
/* BO offsets start at 1 * SIZE, so BO presumably points one element    */
/* before the data it reads (its setup is not visible here).            */
/* The pairs are presumably (real, imaginary) parts -- TODO confirm.    */
/* Some original lines inside this region (labels, possibly prefetch    */
/* instructions, the loop-closing branch) are not part of this view.    */
/* Preload the first pair from each A pointer.                          */
378 LFD f16, 0 * SIZE(AO1)
379 LFD f17, 1 * SIZE(AO1)
380 LFD f18, 0 * SIZE(AO2)
381 LFD f19, 1 * SIZE(AO2)
382 LFD f20, 0 * SIZE(AO3)
383 LFD f21, 1 * SIZE(AO3)
384 LFD f22, 0 * SIZE(AO4)
385 LFD f23, 1 * SIZE(AO4)
/* Preload the first eight BO values.                                   */
387 LFD f24, 1 * SIZE(BO)
388 LFD f25, 2 * SIZE(BO)
389 LFD f26, 3 * SIZE(BO)
390 LFD f27, 4 * SIZE(BO)
391 LFD f28, 5 * SIZE(BO)
392 LFD f29, 6 * SIZE(BO)
393 LFD f30, 7 * SIZE(BO)
394 LFD f31, 8 * SIZE(BO)
/* bdz decrements CTR and branches when it becomes zero: in that case   */
/* only the final pass at MainKernelSkip is executed.  The CTR setup    */
/* is not in this view.                                                 */
396 bdz LL(MainKernelSkip)
/* Loop body, step 1 of 8: A offsets 0,1 times BO pair f24,f25.  Loads  */
/* for the next step are interleaved with the multiply-adds.            */
400 FMADD f0, f16, f24, f0
401 FMADD f1, f16, f25, f1
402 FMADD f2, f17, f24, f2
403 FMADD f3, f17, f25, f3
405 FMADD f4, f18, f24, f4
406 FMADD f5, f18, f25, f5
407 FMADD f6, f19, f24, f6
408 FMADD f7, f19, f25, f7
410 LFD f16, 2 * SIZE(AO1)
411 LFD f17, 3 * SIZE(AO1)
412 LFD f18, 2 * SIZE(AO2)
413 LFD f19, 3 * SIZE(AO2)
415 FMADD f8, f20, f24, f8
416 FMADD f9, f20, f25, f9
417 FMADD f10, f21, f24, f10
418 FMADD f11, f21, f25, f11
420 FMADD f12, f22, f24, f12
421 FMADD f13, f22, f25, f13
422 FMADD f14, f23, f24, f14
423 FMADD f15, f23, f25, f15
425 LFD f20, 2 * SIZE(AO3)
426 LFD f21, 3 * SIZE(AO3)
427 LFD f22, 2 * SIZE(AO4)
428 LFD f23, 3 * SIZE(AO4)
/* Step 2: A offsets 2,3 times BO pair f26,f27.                         */
430 FMADD f0, f16, f26, f0
431 FMADD f1, f16, f27, f1
432 FMADD f2, f17, f26, f2
433 FMADD f3, f17, f27, f3
435 FMADD f4, f18, f26, f4
436 FMADD f5, f18, f27, f5
437 FMADD f6, f19, f26, f6
438 FMADD f7, f19, f27, f7
440 LFD f16, 4 * SIZE(AO1)
441 LFD f17, 5 * SIZE(AO1)
442 LFD f18, 4 * SIZE(AO2)
443 LFD f19, 5 * SIZE(AO2)
445 FMADD f8, f20, f26, f8
446 FMADD f9, f20, f27, f9
447 FMADD f10, f21, f26, f10
448 FMADD f11, f21, f27, f11
450 FMADD f12, f22, f26, f12
451 FMADD f13, f22, f27, f13
452 FMADD f14, f23, f26, f14
453 FMADD f15, f23, f27, f15
455 LFD f20, 4 * SIZE(AO3)
456 LFD f21, 5 * SIZE(AO3)
457 LFD f22, 4 * SIZE(AO4)
458 LFD f23, 5 * SIZE(AO4)
/* f24..f27 are no longer needed for steps 1-2: refill them with BO     */
/* offsets 9..12 for steps 5-6.                                         */
460 LFD f24, 9 * SIZE(BO)
461 LFD f25, 10 * SIZE(BO)
462 LFD f26, 11 * SIZE(BO)
463 LFD f27, 12 * SIZE(BO)
/* Step 3: A offsets 4,5 times BO pair f28,f29.                         */
465 FMADD f0, f16, f28, f0
466 FMADD f1, f16, f29, f1
467 FMADD f2, f17, f28, f2
468 FMADD f3, f17, f29, f3
470 FMADD f4, f18, f28, f4
471 FMADD f5, f18, f29, f5
472 FMADD f6, f19, f28, f6
473 FMADD f7, f19, f29, f7
475 LFD f16, 6 * SIZE(AO1)
476 LFD f17, 7 * SIZE(AO1)
477 LFD f18, 6 * SIZE(AO2)
478 LFD f19, 7 * SIZE(AO2)
480 FMADD f8, f20, f28, f8
481 FMADD f9, f20, f29, f9
482 FMADD f10, f21, f28, f10
483 FMADD f11, f21, f29, f11
485 FMADD f12, f22, f28, f12
486 FMADD f13, f22, f29, f13
487 FMADD f14, f23, f28, f14
488 FMADD f15, f23, f29, f15
490 LFD f20, 6 * SIZE(AO3)
491 LFD f21, 7 * SIZE(AO3)
492 LFD f22, 6 * SIZE(AO4)
493 LFD f23, 7 * SIZE(AO4)
/* Step 4: A offsets 6,7 times BO pair f30,f31.                         */
495 FMADD f0, f16, f30, f0
496 FMADD f1, f16, f31, f1
497 FMADD f2, f17, f30, f2
498 FMADD f3, f17, f31, f3
500 FMADD f4, f18, f30, f4
501 FMADD f5, f18, f31, f5
502 FMADD f6, f19, f30, f6
503 FMADD f7, f19, f31, f7
505 LFD f16, 8 * SIZE(AO1)
506 LFD f17, 9 * SIZE(AO1)
507 LFD f18, 8 * SIZE(AO2)
508 LFD f19, 9 * SIZE(AO2)
510 FMADD f8, f20, f30, f8
511 FMADD f9, f20, f31, f9
512 FMADD f10, f21, f30, f10
513 FMADD f11, f21, f31, f11
515 FMADD f12, f22, f30, f12
516 FMADD f13, f22, f31, f13
517 FMADD f14, f23, f30, f14
518 FMADD f15, f23, f31, f15
520 LFD f20, 8 * SIZE(AO3)
521 LFD f21, 9 * SIZE(AO3)
522 LFD f22, 8 * SIZE(AO4)
523 LFD f23, 9 * SIZE(AO4)
/* Refill f28..f31 with BO offsets 13..16 for steps 7-8.                */
525 LFD f28, 13 * SIZE(BO)
526 LFD f29, 14 * SIZE(BO)
527 LFD f30, 15 * SIZE(BO)
528 LFD f31, 16 * SIZE(BO)
/* Step 5: A offsets 8,9 times BO pair f24,f25.                         */
530 FMADD f0, f16, f24, f0
531 FMADD f1, f16, f25, f1
532 FMADD f2, f17, f24, f2
533 FMADD f3, f17, f25, f3
535 FMADD f4, f18, f24, f4
536 FMADD f5, f18, f25, f5
537 FMADD f6, f19, f24, f6
538 FMADD f7, f19, f25, f7
540 LFD f16, 10 * SIZE(AO1)
541 LFD f17, 11 * SIZE(AO1)
542 LFD f18, 10 * SIZE(AO2)
543 LFD f19, 11 * SIZE(AO2)
545 FMADD f8, f20, f24, f8
546 FMADD f9, f20, f25, f9
547 FMADD f10, f21, f24, f10
548 FMADD f11, f21, f25, f11
550 FMADD f12, f22, f24, f12
551 FMADD f13, f22, f25, f13
552 FMADD f14, f23, f24, f14
553 FMADD f15, f23, f25, f15
555 LFD f20, 10 * SIZE(AO3)
556 LFD f21, 11 * SIZE(AO3)
557 LFD f22, 10 * SIZE(AO4)
558 LFD f23, 11 * SIZE(AO4)
/* Step 6: A offsets 10,11 times BO pair f26,f27.                       */
560 FMADD f0, f16, f26, f0
561 FMADD f1, f16, f27, f1
562 FMADD f2, f17, f26, f2
563 FMADD f3, f17, f27, f3
565 FMADD f4, f18, f26, f4
566 FMADD f5, f18, f27, f5
567 FMADD f6, f19, f26, f6
568 FMADD f7, f19, f27, f7
570 LFD f16, 12 * SIZE(AO1)
571 LFD f17, 13 * SIZE(AO1)
572 LFD f18, 12 * SIZE(AO2)
573 LFD f19, 13 * SIZE(AO2)
575 FMADD f8, f20, f26, f8
576 FMADD f9, f20, f27, f9
577 FMADD f10, f21, f26, f10
578 FMADD f11, f21, f27, f11
580 FMADD f12, f22, f26, f12
581 FMADD f13, f22, f27, f13
582 FMADD f14, f23, f26, f14
583 FMADD f15, f23, f27, f15
585 LFD f20, 12 * SIZE(AO3)
586 LFD f21, 13 * SIZE(AO3)
587 LFD f22, 12 * SIZE(AO4)
588 LFD f23, 13 * SIZE(AO4)
/* Refill f24..f27 with BO offsets 17..20: after BO advances by         */
/* 16 * SIZE below these are offsets 1..4 of the next pass.             */
590 LFD f24, 17 * SIZE(BO)
591 LFD f25, 18 * SIZE(BO)
592 LFD f26, 19 * SIZE(BO)
593 LFD f27, 20 * SIZE(BO)
/* Step 7: A offsets 12,13 times BO pair f28,f29.                       */
595 FMADD f0, f16, f28, f0
596 FMADD f1, f16, f29, f1
597 FMADD f2, f17, f28, f2
598 FMADD f3, f17, f29, f3
600 FMADD f4, f18, f28, f4
601 FMADD f5, f18, f29, f5
602 FMADD f6, f19, f28, f6
603 FMADD f7, f19, f29, f7
605 LFD f16, 14 * SIZE(AO1)
606 LFD f17, 15 * SIZE(AO1)
607 LFD f18, 14 * SIZE(AO2)
608 LFD f19, 15 * SIZE(AO2)
610 FMADD f8, f20, f28, f8
611 FMADD f9, f20, f29, f9
612 FMADD f10, f21, f28, f10
613 FMADD f11, f21, f29, f11
615 FMADD f12, f22, f28, f12
616 FMADD f13, f22, f29, f13
617 FMADD f14, f23, f28, f14
618 FMADD f15, f23, f29, f15
620 LFD f20, 14 * SIZE(AO3)
621 LFD f21, 15 * SIZE(AO3)
622 LFD f22, 14 * SIZE(AO4)
623 LFD f23, 15 * SIZE(AO4)
/* Step 8: A offsets 14,15 times BO pair f30,f31.  The loads at         */
/* offsets 16,17 fetch the first pair of the next pass; they are        */
/* issued before the matching pointer is advanced by 16 * SIZE.         */
625 FMADD f0, f16, f30, f0
626 FMADD f1, f16, f31, f1
627 FMADD f2, f17, f30, f2
628 FMADD f3, f17, f31, f3
630 FMADD f4, f18, f30, f4
631 FMADD f5, f18, f31, f5
632 FMADD f6, f19, f30, f6
633 FMADD f7, f19, f31, f7
635 LFD f16, 16 * SIZE(AO1)
636 LFD f17, 17 * SIZE(AO1)
637 LFD f18, 16 * SIZE(AO2)
638 LFD f19, 17 * SIZE(AO2)
640 addi AO1, AO1, 16 * SIZE
641 addi AO2, AO2, 16 * SIZE
645 FMADD f8, f20, f30, f8
646 FMADD f9, f20, f31, f9
647 FMADD f10, f21, f30, f10
648 FMADD f11, f21, f31, f11
650 FMADD f12, f22, f30, f12
651 FMADD f13, f22, f31, f13
652 FMADD f14, f23, f30, f14
653 FMADD f15, f23, f31, f15
655 LFD f20, 16 * SIZE(AO3)
656 LFD f21, 17 * SIZE(AO3)
657 LFD f22, 16 * SIZE(AO4)
658 LFD f23, 17 * SIZE(AO4)
/* Refill f28..f31 with BO offsets 21..24 (offsets 5..8 of the next     */
/* pass), then advance AO3, AO4 and BO by 16 * SIZE.                    */
660 LFD f28, 21 * SIZE(BO)
661 LFD f29, 22 * SIZE(BO)
662 LFD f30, 23 * SIZE(BO)
663 LFD f31, 24 * SIZE(BO)
665 addi AO3, AO3, 16 * SIZE
666 addi AO4, AO4, 16 * SIZE
670 addi BO, BO, 16 * SIZE
/* Final pass of the four-pointer kernel (target of the                 */
/* bdz LL(MainKernelSkip) earlier; the label line itself is not in      */
/* this view).  Same eight multiply-add steps as the loop body, using   */
/* the same register roles (f16..f23 = A pairs from AO1..AO4,           */
/* f24..f31 = BO values, f0..f15 = accumulators), but nothing is        */
/* preloaded for a following pass: the last A loads are at offsets      */
/* 14,15.                                                               */
/* Step 1: A offsets 0,1 times BO pair f24,f25.                         */
675 FMADD f0, f16, f24, f0
676 FMADD f1, f16, f25, f1
677 FMADD f2, f17, f24, f2
678 FMADD f3, f17, f25, f3
680 FMADD f4, f18, f24, f4
681 FMADD f5, f18, f25, f5
682 FMADD f6, f19, f24, f6
683 FMADD f7, f19, f25, f7
685 LFD f16, 2 * SIZE(AO1)
686 LFD f17, 3 * SIZE(AO1)
687 LFD f18, 2 * SIZE(AO2)
688 LFD f19, 3 * SIZE(AO2)
690 FMADD f8, f20, f24, f8
691 FMADD f9, f20, f25, f9
692 FMADD f10, f21, f24, f10
693 FMADD f11, f21, f25, f11
695 FMADD f12, f22, f24, f12
696 FMADD f13, f22, f25, f13
697 FMADD f14, f23, f24, f14
698 FMADD f15, f23, f25, f15
700 LFD f20, 2 * SIZE(AO3)
701 LFD f21, 3 * SIZE(AO3)
702 LFD f22, 2 * SIZE(AO4)
703 LFD f23, 3 * SIZE(AO4)
/* Step 2: A offsets 2,3 times BO pair f26,f27.                         */
705 FMADD f0, f16, f26, f0
706 FMADD f1, f16, f27, f1
707 FMADD f2, f17, f26, f2
708 FMADD f3, f17, f27, f3
710 FMADD f4, f18, f26, f4
711 FMADD f5, f18, f27, f5
712 FMADD f6, f19, f26, f6
713 FMADD f7, f19, f27, f7
715 LFD f16, 4 * SIZE(AO1)
716 LFD f17, 5 * SIZE(AO1)
717 LFD f18, 4 * SIZE(AO2)
718 LFD f19, 5 * SIZE(AO2)
720 FMADD f8, f20, f26, f8
721 FMADD f9, f20, f27, f9
722 FMADD f10, f21, f26, f10
723 FMADD f11, f21, f27, f11
725 FMADD f12, f22, f26, f12
726 FMADD f13, f22, f27, f13
727 FMADD f14, f23, f26, f14
728 FMADD f15, f23, f27, f15
730 LFD f20, 4 * SIZE(AO3)
731 LFD f21, 5 * SIZE(AO3)
732 LFD f22, 4 * SIZE(AO4)
733 LFD f23, 5 * SIZE(AO4)
/* Step 3: A offsets 4,5 times BO pair f28,f29.                         */
735 FMADD f0, f16, f28, f0
736 FMADD f1, f16, f29, f1
737 FMADD f2, f17, f28, f2
738 FMADD f3, f17, f29, f3
740 FMADD f4, f18, f28, f4
741 FMADD f5, f18, f29, f5
742 FMADD f6, f19, f28, f6
743 FMADD f7, f19, f29, f7
745 LFD f16, 6 * SIZE(AO1)
746 LFD f17, 7 * SIZE(AO1)
747 LFD f18, 6 * SIZE(AO2)
748 LFD f19, 7 * SIZE(AO2)
750 FMADD f8, f20, f28, f8
751 FMADD f9, f20, f29, f9
752 FMADD f10, f21, f28, f10
753 FMADD f11, f21, f29, f11
755 FMADD f12, f22, f28, f12
756 FMADD f13, f22, f29, f13
757 FMADD f14, f23, f28, f14
758 FMADD f15, f23, f29, f15
760 LFD f20, 6 * SIZE(AO3)
761 LFD f21, 7 * SIZE(AO3)
762 LFD f22, 6 * SIZE(AO4)
763 LFD f23, 7 * SIZE(AO4)
/* Step 4: A offsets 6,7 times BO pair f30,f31.                         */
765 FMADD f0, f16, f30, f0
766 FMADD f1, f16, f31, f1
767 FMADD f2, f17, f30, f2
768 FMADD f3, f17, f31, f3
770 FMADD f4, f18, f30, f4
771 FMADD f5, f18, f31, f5
772 FMADD f6, f19, f30, f6
773 FMADD f7, f19, f31, f7
775 LFD f16, 8 * SIZE(AO1)
776 LFD f17, 9 * SIZE(AO1)
777 LFD f18, 8 * SIZE(AO2)
778 LFD f19, 9 * SIZE(AO2)
780 FMADD f8, f20, f30, f8
781 FMADD f9, f20, f31, f9
782 FMADD f10, f21, f30, f10
783 FMADD f11, f21, f31, f11
785 FMADD f12, f22, f30, f12
786 FMADD f13, f22, f31, f13
787 FMADD f14, f23, f30, f14
788 FMADD f15, f23, f31, f15
790 LFD f20, 8 * SIZE(AO3)
791 LFD f21, 9 * SIZE(AO3)
792 LFD f22, 8 * SIZE(AO4)
793 LFD f23, 9 * SIZE(AO4)
/* Load the second group of eight BO values (offsets 9..16).  The last  */
/* load is the update form LFDU, which also advances BO by 16 * SIZE.   */
795 LFD f24, 9 * SIZE(BO)
796 LFD f25, 10 * SIZE(BO)
797 LFD f26, 11 * SIZE(BO)
798 LFD f27, 12 * SIZE(BO)
800 LFD f28, 13 * SIZE(BO)
801 LFD f29, 14 * SIZE(BO)
802 LFD f30, 15 * SIZE(BO)
803 LFDU f31, 16 * SIZE(BO)
/* Step 5: A offsets 8,9 times BO pair f24,f25.                         */
805 FMADD f0, f16, f24, f0
806 FMADD f1, f16, f25, f1
807 FMADD f2, f17, f24, f2
808 FMADD f3, f17, f25, f3
810 FMADD f4, f18, f24, f4
811 FMADD f5, f18, f25, f5
812 FMADD f6, f19, f24, f6
813 FMADD f7, f19, f25, f7
815 LFD f16, 10 * SIZE(AO1)
816 LFD f17, 11 * SIZE(AO1)
817 LFD f18, 10 * SIZE(AO2)
818 LFD f19, 11 * SIZE(AO2)
820 FMADD f8, f20, f24, f8
821 FMADD f9, f20, f25, f9
822 FMADD f10, f21, f24, f10
823 FMADD f11, f21, f25, f11
825 FMADD f12, f22, f24, f12
826 FMADD f13, f22, f25, f13
827 FMADD f14, f23, f24, f14
828 FMADD f15, f23, f25, f15
830 LFD f20, 10 * SIZE(AO3)
831 LFD f21, 11 * SIZE(AO3)
832 LFD f22, 10 * SIZE(AO4)
833 LFD f23, 11 * SIZE(AO4)
/* Step 6: A offsets 10,11 times BO pair f26,f27.                       */
835 FMADD f0, f16, f26, f0
836 FMADD f1, f16, f27, f1
837 FMADD f2, f17, f26, f2
838 FMADD f3, f17, f27, f3
840 FMADD f4, f18, f26, f4
841 FMADD f5, f18, f27, f5
842 FMADD f6, f19, f26, f6
843 FMADD f7, f19, f27, f7
845 LFD f16, 12 * SIZE(AO1)
846 LFD f17, 13 * SIZE(AO1)
847 LFD f18, 12 * SIZE(AO2)
848 LFD f19, 13 * SIZE(AO2)
850 FMADD f8, f20, f26, f8
851 FMADD f9, f20, f27, f9
852 FMADD f10, f21, f26, f10
853 FMADD f11, f21, f27, f11
855 FMADD f12, f22, f26, f12
856 FMADD f13, f22, f27, f13
857 FMADD f14, f23, f26, f14
858 FMADD f15, f23, f27, f15
860 LFD f20, 12 * SIZE(AO3)
861 LFD f21, 13 * SIZE(AO3)
862 LFD f22, 12 * SIZE(AO4)
863 LFD f23, 13 * SIZE(AO4)
/* Step 7: A offsets 12,13 times BO pair f28,f29.                       */
865 FMADD f0, f16, f28, f0
866 FMADD f1, f16, f29, f1
867 FMADD f2, f17, f28, f2
868 FMADD f3, f17, f29, f3
870 FMADD f4, f18, f28, f4
871 FMADD f5, f18, f29, f5
872 FMADD f6, f19, f28, f6
873 FMADD f7, f19, f29, f7
875 LFD f16, 14 * SIZE(AO1)
876 LFD f17, 15 * SIZE(AO1)
877 LFD f18, 14 * SIZE(AO2)
878 LFD f19, 15 * SIZE(AO2)
880 FMADD f8, f20, f28, f8
881 FMADD f9, f20, f29, f9
882 FMADD f10, f21, f28, f10
883 FMADD f11, f21, f29, f11
885 FMADD f12, f22, f28, f12
886 FMADD f13, f22, f29, f13
887 FMADD f14, f23, f28, f14
888 FMADD f15, f23, f29, f15
890 LFD f20, 14 * SIZE(AO3)
891 LFD f21, 15 * SIZE(AO3)
892 LFD f22, 14 * SIZE(AO4)
893 LFD f23, 15 * SIZE(AO4)
/* All 16 values per A pointer have been loaded: advance AO1..AO4 by    */
/* 16 * SIZE.                                                           */
895 addi AO1, AO1, 16 * SIZE
896 addi AO2, AO2, 16 * SIZE
897 addi AO3, AO3, 16 * SIZE
898 addi AO4, AO4, 16 * SIZE
/* Step 8: A offsets 14,15 times BO pair f30,f31.                       */
900 FMADD f0, f16, f30, f0
901 FMADD f1, f16, f31, f1
902 FMADD f2, f17, f30, f2
903 FMADD f3, f17, f31, f3
905 FMADD f4, f18, f30, f4
906 FMADD f5, f18, f31, f5
907 FMADD f6, f19, f30, f6
908 FMADD f7, f19, f31, f7
910 FMADD f8, f20, f30, f8
911 FMADD f9, f20, f31, f9
912 FMADD f10, f21, f30, f10
913 FMADD f11, f21, f31, f11
915 FMADD f12, f22, f30, f12
916 FMADD f13, f22, f31, f13
917 FMADD f14, f23, f30, f14
918 FMADD f15, f23, f31, f15
/* Four-pointer kernel, one pair per pass (presumably the leftover      */
/* after the unrolled loop; the code that sets CTR for it is not in     */
/* this view).  Register roles match the unrolled kernel:               */
/* f16..f23 = one pair from each of AO1..AO4, f24,f25 = one BO pair,    */
/* f0..f15 = accumulators.                                              */
/* Preload one pair from each A pointer and one BO pair.  LFDU also     */
/* advances BO by 2 * SIZE; AO1..AO4 are advanced explicitly.           */
927 LFD f16, 0 * SIZE(AO1)
928 LFD f17, 1 * SIZE(AO1)
929 LFD f18, 0 * SIZE(AO2)
930 LFD f19, 1 * SIZE(AO2)
931 LFD f20, 0 * SIZE(AO3)
932 LFD f21, 1 * SIZE(AO3)
933 LFD f22, 0 * SIZE(AO4)
934 LFD f23, 1 * SIZE(AO4)
936 LFD f24, 1 * SIZE(BO)
937 LFDU f25, 2 * SIZE(BO)
939 addi AO1, AO1, 2 * SIZE
940 addi AO2, AO2, 2 * SIZE
941 addi AO3, AO3, 2 * SIZE
942 addi AO4, AO4, 2 * SIZE
/* bdz decrements CTR; when it reaches zero only the final              */
/* multiply-adds at MainN3KernelSkip remain.                            */
944 bdz LL(MainN3KernelSkip)
/* Loop body (the LL(MainN3Kernel) label line is not in this view):     */
/* accumulate the current pairs while loading the next ones.            */
948 FMADD f0, f16, f24, f0
949 FMADD f1, f16, f25, f1
950 FMADD f2, f17, f24, f2
951 FMADD f3, f17, f25, f3
953 FMADD f4, f18, f24, f4
954 FMADD f5, f18, f25, f5
955 FMADD f6, f19, f24, f6
956 FMADD f7, f19, f25, f7
958 LFD f16, 0 * SIZE(AO1)
959 LFD f17, 1 * SIZE(AO1)
960 LFD f18, 0 * SIZE(AO2)
961 LFD f19, 1 * SIZE(AO2)
963 FMADD f8, f20, f24, f8
964 FMADD f9, f20, f25, f9
965 FMADD f10, f21, f24, f10
966 FMADD f11, f21, f25, f11
968 FMADD f12, f22, f24, f12
969 FMADD f13, f22, f25, f13
970 FMADD f14, f23, f24, f14
971 FMADD f15, f23, f25, f15
973 LFD f20, 0 * SIZE(AO3)
974 LFD f21, 1 * SIZE(AO3)
975 LFD f22, 0 * SIZE(AO4)
976 LFD f23, 1 * SIZE(AO4)
/* f24,f25 are reloaded only after every multiply-add above has used    */
/* the old values.                                                      */
978 LFD f24, 1 * SIZE(BO)
979 LFDU f25, 2 * SIZE(BO)
981 addi AO1, AO1, 2 * SIZE
982 addi AO2, AO2, 2 * SIZE
983 addi AO3, AO3, 2 * SIZE
984 addi AO4, AO4, 2 * SIZE
/* bdnz decrements CTR and loops while it is non-zero.                  */
986 bdnz LL(MainN3Kernel)
/* Drain: multiply-adds for the last pairs loaded; no further loads.    */
989 LL(MainN3KernelSkip):
990 FMADD f0, f16, f24, f0
991 FMADD f1, f16, f25, f1
992 FMADD f2, f17, f24, f2
993 FMADD f3, f17, f25, f3
995 FMADD f4, f18, f24, f4
996 FMADD f5, f18, f25, f5
997 FMADD f6, f19, f24, f6
998 FMADD f7, f19, f25, f7
1000 FMADD f8, f20, f24, f8
1001 FMADD f9, f20, f25, f9
1002 FMADD f10, f21, f24, f10
1003 FMADD f11, f21, f25, f11
1005 FMADD f12, f22, f24, f12
1006 FMADD f13, f22, f25, f13
1007 FMADD f14, f23, f24, f14
1008 FMADD f15, f23, f25, f15
/* Result update for the four-pointer kernel.                           */
/* Only f0,f1 / f4,f5 / f8,f9 / f12,f13 are read here, so the four      */
/* accumulators per A pointer have presumably been folded into a pair   */
/* by code between the kernel and this point (not in this view).        */
/* f30,f31 are used as the two scale factors; presumably they were      */
/* reloaded from ALPHA_R / ALPHA_I -- TODO confirm, the load is not     */
/* visible.  For each output pair (c0,c1) with accumulator pair         */
/* (t0,t1):                                                             */
/*     c0 = FMADD (f30, t0, c0);  c1 = FMADDR(f30, t1, c1)              */
/*     c0 = FMSUBR(f31, t1, c0);  c1 = FMADD (f31, t0, c1)              */
/* where FMADDR / FMSUBR are the configuration-dependent macros         */
/* defined near the top of the file.                                    */
/* Compare INCY with 2 * SIZE; the branch that uses cr0 is not visible  */
/* (presumably it chooses between the two variants below).              */
1058 cmpwi cr0, INCY, 2 * SIZE
/* Variant 1: eight consecutive values at CO.                           */
1061 LFD f16, 0 * SIZE(CO)
1062 LFD f17, 1 * SIZE(CO)
1063 LFD f18, 2 * SIZE(CO)
1064 LFD f19, 3 * SIZE(CO)
1065 LFD f20, 4 * SIZE(CO)
1066 LFD f21, 5 * SIZE(CO)
1067 LFD f22, 6 * SIZE(CO)
1068 LFD f23, 7 * SIZE(CO)
/* Contributions scaled by f30.                                         */
1070 FMADD f16, f30, f0, f16
1071 FMADDR f17, f30, f1, f17
1072 FMADD f18, f30, f4, f18
1073 FMADDR f19, f30, f5, f19
1075 FMADD f20, f30, f8, f20
1076 FMADDR f21, f30, f9, f21
1077 FMADD f22, f30, f12, f22
1078 FMADDR f23, f30, f13, f23
/* Contributions scaled by f31 (accumulator halves swapped).            */
1080 FMSUBR f16, f31, f1, f16
1081 FMADD f17, f31, f0, f17
1082 FMSUBR f18, f31, f5, f18
1083 FMADD f19, f31, f4, f19
1085 FMSUBR f20, f31, f9, f20
1086 FMADD f21, f31, f8, f21
1087 FMSUBR f22, f31, f13, f22
1088 FMADD f23, f31, f12, f23
/* Write the eight results back and advance CO by 8 * SIZE.             */
1090 STFD f16, 0 * SIZE(CO)
1091 STFD f17, 1 * SIZE(CO)
1092 STFD f18, 2 * SIZE(CO)
1093 STFD f19, 3 * SIZE(CO)
1095 STFD f20, 4 * SIZE(CO)
1096 STFD f21, 5 * SIZE(CO)
1097 STFD f22, 6 * SIZE(CO)
1098 STFD f23, 7 * SIZE(CO)
1100 addi CO, CO, 8 * SIZE
/* Variant 2: four pairs, each read at offsets 0,1 of CO.  The pointer  */
/* updates between the pairs (original lines not in this view)          */
/* presumably step CO by INCY -- TODO confirm.                          */
1109 LFD f16, 0 * SIZE(CO)
1110 LFD f17, 1 * SIZE(CO)
1113 LFD f18, 0 * SIZE(CO)
1114 LFD f19, 1 * SIZE(CO)
1117 LFD f20, 0 * SIZE(CO)
1118 LFD f21, 1 * SIZE(CO)
1121 LFD f22, 0 * SIZE(CO)
1122 LFD f23, 1 * SIZE(CO)
1125 FMADD f16, f30, f0, f16
1126 FMADDR f17, f30, f1, f17
1127 FMADD f18, f30, f4, f18
1128 FMADDR f19, f30, f5, f19
1130 FMADD f20, f30, f8, f20
1131 FMADDR f21, f30, f9, f21
1132 FMADD f22, f30, f12, f22
1133 FMADDR f23, f30, f13, f23
1135 FMSUBR f16, f31, f1, f16
1136 FMADD f17, f31, f0, f17
1137 FMSUBR f18, f31, f5, f18
1138 FMADD f19, f31, f4, f19
1140 FMSUBR f20, f31, f9, f20
1141 FMADD f21, f31, f8, f21
1142 FMSUBR f22, f31, f13, f22
1143 FMADD f23, f31, f12, f23
/* Results are stored through BO rather than CO.  BO is presumably a    */
/* copy of the CO value from before the loads above, stepped the same   */
/* way between pairs; those instructions are not in this view.          */
1145 STFD f16, 0 * SIZE(BO)
1146 STFD f17, 1 * SIZE(BO)
1148 STFD f18, 0 * SIZE(BO)
1149 STFD f19, 1 * SIZE(BO)
1152 STFD f20, 0 * SIZE(BO)
1153 STFD f21, 1 * SIZE(BO)
1155 STFD f22, 0 * SIZE(BO)
1156 STFD f23, 1 * SIZE(BO)
/* Single-pointer kernel: only AO1 is read.                             */
/* Register roles as used below:                                        */
/*   f16..f23 : eight consecutive values from AO1 (four pairs)          */
/*   f24..f31 : eight consecutive values from BO  (four pairs)          */
/*   f0-f3   accumulate pair f16,f17 times pair f24,f25                 */
/*   f4-f7   accumulate pair f18,f19 times pair f26,f27                 */
/*   f8-f11  accumulate pair f20,f21 times pair f28,f29                 */
/*   f12-f15 accumulate pair f22,f23 times pair f30,f31                 */
/* Each loop pass consumes 16 values from AO1 and 16 from BO.           */
/* r0 = MIN_N >> 3 (arithmetic shift); the dot form also sets cr0.      */
/* The instructions that use r0 / cr0 are not in this view (presumably  */
/* r0 becomes the loop count in CTR).                                   */
1190 srawi. r0 , MIN_N, 3
/* Preload the first eight values from AO1 and from BO.                 */
1194 LFD f16, 0 * SIZE(AO1)
1195 LFD f17, 1 * SIZE(AO1)
1196 LFD f18, 2 * SIZE(AO1)
1197 LFD f19, 3 * SIZE(AO1)
1199 LFD f20, 4 * SIZE(AO1)
1200 LFD f21, 5 * SIZE(AO1)
1201 LFD f22, 6 * SIZE(AO1)
1202 LFD f23, 7 * SIZE(AO1)
1204 LFD f24, 1 * SIZE(BO)
1205 LFD f25, 2 * SIZE(BO)
1206 LFD f26, 3 * SIZE(BO)
1207 LFD f27, 4 * SIZE(BO)
1209 LFD f28, 5 * SIZE(BO)
1210 LFD f29, 6 * SIZE(BO)
1211 LFD f30, 7 * SIZE(BO)
1212 LFD f31, 8 * SIZE(BO)
/* bdz decrements CTR; when it reaches zero only the final pass at      */
/* RemainKernelSkip runs.                                               */
1214 bdz LL(RemainKernelSkip)
/* Loop body (the LL(RemainKernel) label line is not in this view).     */
/* First half: values 0..7, while values 8..15 are loaded.              */
1218 FMADD f0, f16, f24, f0
1219 FMADD f1, f16, f25, f1
1220 FMADD f2, f17, f24, f2
1221 FMADD f3, f17, f25, f3
1223 FMADD f4, f18, f26, f4
1224 FMADD f5, f18, f27, f5
1225 FMADD f6, f19, f26, f6
1226 FMADD f7, f19, f27, f7
1228 LFD f16, 8 * SIZE(AO1)
1229 LFD f17, 9 * SIZE(AO1)
1230 LFD f18, 10 * SIZE(AO1)
1231 LFD f19, 11 * SIZE(AO1)
1233 LFD f24, 9 * SIZE(BO)
1234 LFD f25, 10 * SIZE(BO)
1235 LFD f26, 11 * SIZE(BO)
1236 LFD f27, 12 * SIZE(BO)
1238 FMADD f8, f20, f28, f8
1239 FMADD f9, f20, f29, f9
1240 FMADD f10, f21, f28, f10
1241 FMADD f11, f21, f29, f11
1243 FMADD f12, f22, f30, f12
1244 FMADD f13, f22, f31, f13
1245 FMADD f14, f23, f30, f14
1246 FMADD f15, f23, f31, f15
1248 LFD f20, 12 * SIZE(AO1)
1249 LFD f21, 13 * SIZE(AO1)
1250 LFD f22, 14 * SIZE(AO1)
1251 LFD f23, 15 * SIZE(AO1)
1253 LFD f28, 13 * SIZE(BO)
1254 LFD f29, 14 * SIZE(BO)
1255 LFD f30, 15 * SIZE(BO)
1256 LFD f31, 16 * SIZE(BO)
/* Second half: values 8..15, while the first eight values of the next  */
/* pass (AO1 offsets 16..23, BO offsets 17..24) are loaded.             */
1258 FMADD f0, f16, f24, f0
1259 FMADD f1, f16, f25, f1
1260 FMADD f2, f17, f24, f2
1261 FMADD f3, f17, f25, f3
1263 FMADD f4, f18, f26, f4
1264 FMADD f5, f18, f27, f5
1265 FMADD f6, f19, f26, f6
1266 FMADD f7, f19, f27, f7
1268 LFD f16, 16 * SIZE(AO1)
1269 LFD f17, 17 * SIZE(AO1)
1270 LFD f18, 18 * SIZE(AO1)
1271 LFD f19, 19 * SIZE(AO1)
1273 LFD f24, 17 * SIZE(BO)
1274 LFD f25, 18 * SIZE(BO)
1275 LFD f26, 19 * SIZE(BO)
1276 LFD f27, 20 * SIZE(BO)
1278 FMADD f8, f20, f28, f8
1279 FMADD f9, f20, f29, f9
1280 FMADD f10, f21, f28, f10
1281 FMADD f11, f21, f29, f11
1283 FMADD f12, f22, f30, f12
1284 FMADD f13, f22, f31, f13
1285 FMADD f14, f23, f30, f14
1286 FMADD f15, f23, f31, f15
1288 LFD f20, 20 * SIZE(AO1)
1289 LFD f21, 21 * SIZE(AO1)
1290 LFD f22, 22 * SIZE(AO1)
1291 LFD f23, 23 * SIZE(AO1)
1293 LFD f28, 21 * SIZE(BO)
1294 LFD f29, 22 * SIZE(BO)
1295 LFD f30, 23 * SIZE(BO)
1296 LFD f31, 24 * SIZE(BO)
/* Advance both pointers past the 16 values consumed in this pass.      */
1298 addi AO1, AO1, 16 * SIZE
1299 addi BO, BO, 16 * SIZE
/* bdnz decrements CTR and loops while it is non-zero.                  */
1303 bdnz LL(RemainKernel)
/* Final pass: same two halves, but nothing is preloaded for a          */
/* following pass.                                                      */
1306 LL(RemainKernelSkip):
1307 FMADD f0, f16, f24, f0
1308 FMADD f1, f16, f25, f1
1309 FMADD f2, f17, f24, f2
1310 FMADD f3, f17, f25, f3
1312 FMADD f4, f18, f26, f4
1313 FMADD f5, f18, f27, f5
1314 FMADD f6, f19, f26, f6
1315 FMADD f7, f19, f27, f7
1317 LFD f16, 8 * SIZE(AO1)
1318 LFD f17, 9 * SIZE(AO1)
1319 LFD f18, 10 * SIZE(AO1)
1320 LFD f19, 11 * SIZE(AO1)
1322 LFD f24, 9 * SIZE(BO)
1323 LFD f25, 10 * SIZE(BO)
1324 LFD f26, 11 * SIZE(BO)
1325 LFD f27, 12 * SIZE(BO)
1327 FMADD f8, f20, f28, f8
1328 FMADD f9, f20, f29, f9
1329 FMADD f10, f21, f28, f10
1330 FMADD f11, f21, f29, f11
1332 FMADD f12, f22, f30, f12
1333 FMADD f13, f22, f31, f13
1334 FMADD f14, f23, f30, f14
1335 FMADD f15, f23, f31, f15
1337 LFD f20, 12 * SIZE(AO1)
1338 LFD f21, 13 * SIZE(AO1)
1339 LFD f22, 14 * SIZE(AO1)
1340 LFD f23, 15 * SIZE(AO1)
/* The update form LFDU on the last BO load advances BO by 16 * SIZE.   */
1342 LFD f28, 13 * SIZE(BO)
1343 LFD f29, 14 * SIZE(BO)
1344 LFD f30, 15 * SIZE(BO)
1345 LFDU f31, 16 * SIZE(BO)
1347 FMADD f0, f16, f24, f0
1348 FMADD f1, f16, f25, f1
1349 FMADD f2, f17, f24, f2
1350 FMADD f3, f17, f25, f3
1352 FMADD f4, f18, f26, f4
1353 FMADD f5, f18, f27, f5
1354 FMADD f6, f19, f26, f6
1355 FMADD f7, f19, f27, f7
1357 FMADD f8, f20, f28, f8
1358 FMADD f9, f20, f29, f9
1359 FMADD f10, f21, f28, f10
1360 FMADD f11, f21, f29, f11
1362 FMADD f12, f22, f30, f12
1363 FMADD f13, f22, f31, f13
1364 FMADD f14, f23, f30, f14
1365 FMADD f15, f23, f31, f15
1367 addi AO1, AO1, 16 * SIZE
/* Single-pointer kernel, one pair per pass, followed by the result     */
/* update and the stack-frame release.                                  */
/* Skip the leftover loop when the preceding compare (not in this       */
/* view) reports "less than or equal".                                  */
1373 ble LL(RemainFinish)
/* Preload one pair from AO1 and one from BO; LFDU advances BO by       */
/* 2 * SIZE and AO1 is advanced explicitly.                             */
1376 LFD f16, 0 * SIZE(AO1)
1377 LFD f17, 1 * SIZE(AO1)
1378 LFD f24, 1 * SIZE(BO)
1379 LFDU f25, 2 * SIZE(BO)
1380 addi AO1, AO1, 2 * SIZE
/* bdz decrements CTR; at zero only the drain below remains.            */
1381 bdz LL(RemainN3KernelSkip)
/* Loop body (the LL(RemainN3Kernel) label line is not in this view):   */
/* accumulate the four products of the current pairs into f0..f3, then  */
/* load the next pairs.                                                 */
1385 FMADD f0, f16, f24, f0
1386 FMADD f1, f16, f25, f1
1387 FMADD f2, f17, f24, f2
1388 FMADD f3, f17, f25, f3
1390 LFD f16, 0 * SIZE(AO1)
1391 LFD f17, 1 * SIZE(AO1)
1392 LFD f24, 1 * SIZE(BO)
1393 LFDU f25, 2 * SIZE(BO)
1394 addi AO1, AO1, 2 * SIZE
1395 bdnz LL(RemainN3Kernel)
/* Drain: multiply-adds for the last pairs loaded.                      */
1398 LL(RemainN3KernelSkip):
1399 FMADD f0, f16, f24, f0
1400 FMADD f1, f16, f25, f1
1401 FMADD f2, f17, f24, f2
1402 FMADD f3, f17, f25, f3
/* Result update for one output pair at CO.  Original lines between     */
/* the loads and the multiply-adds are not in this view; presumably     */
/* they fold the accumulators into f0,f1 and load the scale factors     */
/* into f30,f31 -- TODO confirm.                                        */
1408 LFD f16, 0 * SIZE(CO)
1409 LFD f17, 1 * SIZE(CO)
/* Same update pattern as the four-pointer path: f30 scales (f0, f1)    */
/* into (f16, f17), f31 scales the swapped pair, with FMADDR / FMSUBR   */
/* being the configuration-dependent forms.                             */
1444 FMADD f16, f30, f0, f16
1445 FMADDR f17, f30, f1, f17
1446 FMSUBR f16, f31, f1, f16
1447 FMADD f17, f31, f0, f17
1449 STFD f16, 0 * SIZE(CO)
1450 STFD f17, 1 * SIZE(CO)
/* Release the STACKSIZE-byte frame reserved at entry.                  */
1516 addi SP, SP, STACKSIZE