1 /***************************************************************************
2 Copyright (c) 2013-2020, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
/* MY_ALIGN expands to the assembler directive ".align 3". */
/* NOTE(review): with GNU as on PowerPC the operand is a power-of-two exponent, so this should request 8-byte alignment - confirm for the toolchain in use. */
28 #define MY_ALIGN .align 3
30 /* MINI SUBROUTINES */
31 /* 4x8 MAIN 128x+2 LOOP */
/* Unrolled KERNEL4x8 call sequences for the 4x8 tile. */
/* Every call passes 128,64 as the first two arguments, a running index as the third, and 1 as the fourth only on the last call of a run. */
/* The macro definitions are not visible in this view, so the meaning of each argument must be taken from them. */
/* NOTE(review): no labels, loads, or returns are visible between the runs here. The entry points are presumably defined on lines not shown - confirm against the full file. */
/* NOTE(review): the _L2 suffix and the 128x header suggest each call covers two k steps, which would make the four runs 128, 64, 32 and 16 steps long - TODO confirm. */
35 /*----------------------------------------*/
40 /*----------------------------------------*/
/* Run 1: 64 KERNEL4x8_L2 calls, index 0..63. Presumably the loop body reached through bl CGEMM_L4x8_LMAIN_SUB - TODO confirm. */
43 KERNEL4x8_L2 128,64,0,0
45 /*----------------------------------------*/
46 KERNEL4x8_L2 128,64,1,0
48 KERNEL4x8_L2 128,64,2,0
49 KERNEL4x8_L2 128,64,3,0
52 KERNEL4x8_L2 128,64,4,0
53 KERNEL4x8_L2 128,64,5,0
55 KERNEL4x8_L2 128,64,6,0
56 KERNEL4x8_L2 128,64,7,0
59 KERNEL4x8_L2 128,64,8,0
60 KERNEL4x8_L2 128,64,9,0
61 KERNEL4x8_L2 128,64,10,0
62 KERNEL4x8_L2 128,64,11,0
64 KERNEL4x8_L2 128,64,12,0
65 KERNEL4x8_L2 128,64,13,0
66 KERNEL4x8_L2 128,64,14,0
67 KERNEL4x8_L2 128,64,15,0
68 KERNEL4x8_L2 128,64,16,0
69 KERNEL4x8_L2 128,64,17,0
70 KERNEL4x8_L2 128,64,18,0
71 KERNEL4x8_L2 128,64,19,0
72 KERNEL4x8_L2 128,64,20,0
73 KERNEL4x8_L2 128,64,21,0
74 KERNEL4x8_L2 128,64,22,0
75 KERNEL4x8_L2 128,64,23,0
76 KERNEL4x8_L2 128,64,24,0
77 KERNEL4x8_L2 128,64,25,0
78 KERNEL4x8_L2 128,64,26,0
79 KERNEL4x8_L2 128,64,27,0
80 KERNEL4x8_L2 128,64,28,0
81 KERNEL4x8_L2 128,64,29,0
82 KERNEL4x8_L2 128,64,30,0
83 KERNEL4x8_L2 128,64,31,0
84 KERNEL4x8_L2 128,64,32,0
85 KERNEL4x8_L2 128,64,33,0
86 KERNEL4x8_L2 128,64,34,0
87 KERNEL4x8_L2 128,64,35,0
88 KERNEL4x8_L2 128,64,36,0
89 KERNEL4x8_L2 128,64,37,0
90 KERNEL4x8_L2 128,64,38,0
91 KERNEL4x8_L2 128,64,39,0
92 KERNEL4x8_L2 128,64,40,0
93 KERNEL4x8_L2 128,64,41,0
94 KERNEL4x8_L2 128,64,42,0
95 KERNEL4x8_L2 128,64,43,0
96 KERNEL4x8_L2 128,64,44,0
97 KERNEL4x8_L2 128,64,45,0
98 KERNEL4x8_L2 128,64,46,0
99 KERNEL4x8_L2 128,64,47,0
100 KERNEL4x8_L2 128,64,48,0
101 KERNEL4x8_L2 128,64,49,0
102 KERNEL4x8_L2 128,64,50,0
103 KERNEL4x8_L2 128,64,51,0
104 KERNEL4x8_L2 128,64,52,0
105 KERNEL4x8_L2 128,64,53,0
106 KERNEL4x8_L2 128,64,54,0
107 KERNEL4x8_L2 128,64,55,0
108 KERNEL4x8_L2 128,64,56,0
109 KERNEL4x8_L2 128,64,57,0
110 KERNEL4x8_L2 128,64,58,0
111 KERNEL4x8_L2 128,64,59,0
112 KERNEL4x8_L2 128,64,60,0
113 KERNEL4x8_L2 128,64,61,0
114 KERNEL4x8_L2 128,64,62,0
115 KERNEL4x8_L2 128,64,63,1
119 /*----------------------------------------*/
126 /*----------------------------------------*/
/* Run 2: 32 calls, index 0..31. The last call uses the KERNEL4x8_E2 variant instead of KERNEL4x8_L2. */
130 KERNEL4x8_L2 128,64,0,0
131 KERNEL4x8_L2 128,64,1,0
133 KERNEL4x8_L2 128,64,2,0
134 KERNEL4x8_L2 128,64,3,0
137 KERNEL4x8_L2 128,64,4,0
138 KERNEL4x8_L2 128,64,5,0
140 KERNEL4x8_L2 128,64,6,0
141 KERNEL4x8_L2 128,64,7,0
144 KERNEL4x8_L2 128,64,8,0
145 KERNEL4x8_L2 128,64,9,0
146 KERNEL4x8_L2 128,64,10,0
147 KERNEL4x8_L2 128,64,11,0
149 KERNEL4x8_L2 128,64,12,0
150 KERNEL4x8_L2 128,64,13,0
151 KERNEL4x8_L2 128,64,14,0
152 KERNEL4x8_L2 128,64,15,0
153 KERNEL4x8_L2 128,64,16,0
154 KERNEL4x8_L2 128,64,17,0
155 KERNEL4x8_L2 128,64,18,0
156 KERNEL4x8_L2 128,64,19,0
157 KERNEL4x8_L2 128,64,20,0
158 KERNEL4x8_L2 128,64,21,0
159 KERNEL4x8_L2 128,64,22,0
160 KERNEL4x8_L2 128,64,23,0
161 KERNEL4x8_L2 128,64,24,0
162 KERNEL4x8_L2 128,64,25,0
163 KERNEL4x8_L2 128,64,26,0
164 KERNEL4x8_L2 128,64,27,0
165 KERNEL4x8_L2 128,64,28,0
166 KERNEL4x8_L2 128,64,29,0
167 KERNEL4x8_L2 128,64,30,0
168 KERNEL4x8_E2 128,64,31,1
174 /*----------------------------------------*/
/* Run 3: 16 calls, index 0..15, closed by KERNEL4x8_E2. */
178 KERNEL4x8_L2 128,64,0,0
179 KERNEL4x8_L2 128,64,1,0
181 KERNEL4x8_L2 128,64,2,0
182 KERNEL4x8_L2 128,64,3,0
185 KERNEL4x8_L2 128,64,4,0
186 KERNEL4x8_L2 128,64,5,0
188 KERNEL4x8_L2 128,64,6,0
189 KERNEL4x8_L2 128,64,7,0
192 KERNEL4x8_L2 128,64,8,0
193 KERNEL4x8_L2 128,64,9,0
194 KERNEL4x8_L2 128,64,10,0
195 KERNEL4x8_L2 128,64,11,0
197 KERNEL4x8_L2 128,64,12,0
198 KERNEL4x8_L2 128,64,13,0
199 KERNEL4x8_L2 128,64,14,0
200 KERNEL4x8_E2 128,64,15,1
206 /*----------------------------------------*/
/* Run 4: 8 calls, index 0..7, closed by KERNEL4x8_E2. */
210 KERNEL4x8_L2 128,64,0,0
211 KERNEL4x8_L2 128,64,1,0
213 KERNEL4x8_L2 128,64,2,0
214 KERNEL4x8_L2 128,64,3,0
217 KERNEL4x8_L2 128,64,4,0
218 KERNEL4x8_L2 128,64,5,0
220 KERNEL4x8_L2 128,64,6,0
221 KERNEL4x8_E2 128,64,7,1
/* Unrolled KERNEL4x4 call sequences for the 4x4 tile. Every call passes 64,64 followed by a running index, and the last call of each run passes 1 as the fourth argument. */
/* NOTE(review): labels and returns separating the runs are not visible in this view. Run 1 is presumably the loop body reached through bl CGEMM_4x4_LMAIN_SUB - TODO confirm. */
227 /*----------------------------------------*/
232 /*----------------------------------------*/
/* Run 1: 16 KERNEL4x4_L2 calls, index 0..15. */
233 KERNEL4x4_L2 64,64,0,0
235 /*----------------------------------------*/
236 KERNEL4x4_L2 64,64,1,0
237 KERNEL4x4_L2 64,64,2,0
238 KERNEL4x4_L2 64,64,3,0
239 KERNEL4x4_L2 64,64,4,0
240 KERNEL4x4_L2 64,64,5,0
241 KERNEL4x4_L2 64,64,6,0
242 KERNEL4x4_L2 64,64,7,0
243 KERNEL4x4_L2 64,64,8,0
244 KERNEL4x4_L2 64,64,9,0
245 KERNEL4x4_L2 64,64,10,0
246 KERNEL4x4_L2 64,64,11,0
247 KERNEL4x4_L2 64,64,12,0
248 KERNEL4x4_L2 64,64,13,0
249 KERNEL4x4_L2 64,64,14,0
250 KERNEL4x4_L2 64,64,15,1
254 /*----------------------------------------*/
261 /*----------------------------------------*/
/* Run 2: 8 calls, index 0..7, closed by the KERNEL4x4_E2 variant. */
263 KERNEL4x4_L2 64,64,0,0
264 KERNEL4x4_L2 64,64,1,0
265 KERNEL4x4_L2 64,64,2,0
266 KERNEL4x4_L2 64,64,3,0
267 KERNEL4x4_L2 64,64,4,0
268 KERNEL4x4_L2 64,64,5,0
269 KERNEL4x4_L2 64,64,6,0
270 KERNEL4x4_E2 64,64,7,1
276 /*----------------------------------------*/
/* Run 3: 4 calls, index 0..3, closed by KERNEL4x4_E2. */
278 KERNEL4x4_L2 64,64,0,0
279 KERNEL4x4_L2 64,64,1,0
280 KERNEL4x4_L2 64,64,2,0
281 KERNEL4x4_E2 64,64,3,1
/* Unrolled KERNEL4x2 call sequences for the 4x2 tile. Every call passes 32,64 followed by a running index, and the last call of each run passes 1 as the fourth argument. */
/* NOTE(review): labels and returns separating the runs are not visible in this view. Run 1 is presumably the loop body reached through bl CGEMM_4x2_LMAIN_SUB - TODO confirm. */
286 /*----------------------------------------*/
291 /*----------------------------------------*/
/* Run 1: 16 KERNEL4x2_L2 calls, index 0..15. */
292 KERNEL4x2_L2 32,64,0,0
294 /*----------------------------------------*/
295 KERNEL4x2_L2 32,64,1,0
296 KERNEL4x2_L2 32,64,2,0
297 KERNEL4x2_L2 32,64,3,0
298 KERNEL4x2_L2 32,64,4,0
299 KERNEL4x2_L2 32,64,5,0
300 KERNEL4x2_L2 32,64,6,0
301 KERNEL4x2_L2 32,64,7,0
302 KERNEL4x2_L2 32,64,8,0
303 KERNEL4x2_L2 32,64,9,0
304 KERNEL4x2_L2 32,64,10,0
305 KERNEL4x2_L2 32,64,11,0
306 KERNEL4x2_L2 32,64,12,0
307 KERNEL4x2_L2 32,64,13,0
308 KERNEL4x2_L2 32,64,14,0
309 KERNEL4x2_L2 32,64,15,1
315 /*----------------------------------------*/
320 /*----------------------------------------*/
/* Run 2: 8 calls, index 0..7, closed by the KERNEL4x2_E2 variant. */
322 KERNEL4x2_L2 32,64,0,0
323 KERNEL4x2_L2 32,64,1,0
324 KERNEL4x2_L2 32,64,2,0
325 KERNEL4x2_L2 32,64,3,0
326 KERNEL4x2_L2 32,64,4,0
327 KERNEL4x2_L2 32,64,5,0
328 KERNEL4x2_L2 32,64,6,0
329 KERNEL4x2_E2 32,64,7,1
333 /*----------------------------------------*/
/* Run 3: 4 calls, index 0..3, closed by KERNEL4x2_E2. */
335 KERNEL4x2_L2 32,64,0,0
336 KERNEL4x2_L2 32,64,1,0
337 KERNEL4x2_L2 32,64,2,0
338 KERNEL4x2_E2 32,64,3,1
/* Unrolled KERNEL4x1 call sequences for the 4x1 tile. Every call passes 16,64 followed by a running index, and the last call of each run passes 1 as the fourth argument. */
/* NOTE(review): labels and returns separating the runs are not visible in this view. Run 1 is presumably the loop body reached through bl CGEMM_4x1_LMAIN_SUB - TODO confirm. */
343 /*----------------------------------------*/
348 /*----------------------------------------*/
/* Run 1: 16 KERNEL4x1_L2 calls, index 0..15. */
349 KERNEL4x1_L2 16,64,0,0
351 /*----------------------------------------*/
352 KERNEL4x1_L2 16,64,1,0
353 KERNEL4x1_L2 16,64,2,0
354 KERNEL4x1_L2 16,64,3,0
355 KERNEL4x1_L2 16,64,4,0
356 KERNEL4x1_L2 16,64,5,0
357 KERNEL4x1_L2 16,64,6,0
358 KERNEL4x1_L2 16,64,7,0
359 KERNEL4x1_L2 16,64,8,0
360 KERNEL4x1_L2 16,64,9,0
361 KERNEL4x1_L2 16,64,10,0
362 KERNEL4x1_L2 16,64,11,0
363 KERNEL4x1_L2 16,64,12,0
364 KERNEL4x1_L2 16,64,13,0
365 KERNEL4x1_L2 16,64,14,0
366 KERNEL4x1_L2 16,64,15,1
370 /*----------------------------------------*/
376 /*----------------------------------------*/
/* Run 2: 8 calls, index 0..7, closed by the KERNEL4x1_E2 variant. */
378 KERNEL4x1_L2 16,64,0,0
379 KERNEL4x1_L2 16,64,1,0
380 KERNEL4x1_L2 16,64,2,0
381 KERNEL4x1_L2 16,64,3,0
382 KERNEL4x1_L2 16,64,4,0
383 KERNEL4x1_L2 16,64,5,0
384 KERNEL4x1_L2 16,64,6,0
385 KERNEL4x1_E2 16,64,7,1
391 /*----------------------------------------*/
/* Run 3: 4 calls, index 0..3, closed by KERNEL4x1_E2. */
393 KERNEL4x1_L2 16,64,0,0
394 KERNEL4x1_L2 16,64,1,0
395 KERNEL4x1_L2 16,64,2,0
396 KERNEL4x1_E2 16,64,3,1
401 /* MAIN LOOP BEGINS */
/* Driver fragment for the 4x8 tile: pointer refresh, pass-count computation, call into the unrolled loop, then a chain of remainder steps. */
/* NOTE(review): the labels, compare instructions, and conditional-compilation terminators belonging to this fragment are not visible in this view, so the exact control flow cannot be reconstructed from here. */
406 /*----------------------------------------*/
407 #if defined(TRMMKERNEL) && !defined(LEFT)
410 /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */
/* NOTE(review): the instructions that initialise vs56 and vs57 before this point are not visible, so the resulting bit pattern cannot be checked from here. */
413 xxsldwi vs57, vs56, vs57, 1
414 xxpermdi vs57, vs57, vs57, 3
420 /*----------------------------------------*/
426 #if defined(TRMMKERNEL) && defined(LEFT)
427 mr TEMP_REG, OFFSET /*off = offset;*/
/* dcbt is a cache-touch hint for the block addressed by CO plus the contents of r0. It does not change architected state. */
431 dcbt CO,r0 /*just prefetch*/
436 /*----------------------------------------*/
/* The trailing 8,4 arguments of the REFRESH_* macros match the tile shape in this section's kernel names (4x8). The macros themselves are defined elsewhere. */
437 #if defined(TRMMKERNEL)
438 REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
444 #if defined(TRMMKERNEL)
445 REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
447 /* TEMPS FOR PREFETCH */
451 /* TEMPS FOR PREFETCH */
454 srawi. T8, T1, 7 /* T8 = T1 >> 7 (arithmetic shift, i.e. floor(T1 / 128), a quotient), CR0 updated. Original note: (T1-2) % 128x */
457 /* TEMPS FOR PREFETCH */
461 /* TEMPS FOR PREFETCH */
464 srawi. T8, T1, 7 /* T8 = T1 >> 7 (arithmetic shift, i.e. floor(T1 / 128), a quotient), CR0 updated. Original note: (K-2) % 128x */
/* NOTE(review): the two srawi lines presumably sit in alternative preprocessor branches (TRMM versus plain GEMM) - the branch structure is not visible here. */
/* bl stores the return address in LR and jumps to the unrolled main loop. */
468 bl CGEMM_L4x8_LMAIN_SUB
475 /*----------------------------------------*/
476 #if defined(TRMMKERNEL)
494 /*----------------------------------------*/
495 #if defined(TRMMKERNEL)
512 /*----------------------------------------*/
/* Remainder chain: each ble skips ahead to the named label when the preceding compare (not visible here) reports less-or-equal. Targets step down 32, 16, 8, 4, 2, 1. */
514 ble CGEMM_L4x8_SUB2_32
520 /*----------------------------------------*/
522 ble CGEMM_L4x8_SUB2_16
528 /*----------------------------------------*/
530 ble CGEMM_L4x8_SUB2_8
536 /*----------------------------------------*/
538 ble CGEMM_L4x8_SUB2_4
/* Inline remainder step: 4 kernel calls, the last being the E2 variant. */
540 KERNEL4x8_L2 128,64, 0,0
541 KERNEL4x8_L2 128,64, 1,0
542 KERNEL4x8_L2 128,64, 2,0
543 KERNEL4x8_E2 128,64, 3,1
548 /*----------------------------------------*/
550 ble CGEMM_L4x8_SUB2_2
/* Inline remainder step: 2 kernel calls. */
552 KERNEL4x8_L2 128,64, 0,0
553 KERNEL4x8_E2 128,64, 1,1
558 /*----------------------------------------*/
560 ble CGEMM_L4x8_SUB2_1
/* Inline remainder step: a single E2 call. */
562 KERNEL4x8_E2 128,64, 0,1
567 /*----------------------------------------*/
574 /*----------------------------------------*/
578 #if defined(TRMMKERNEL)
579 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
/* Driver fragment for the 4x4 tile. The REFRESH_* macros receive 4,4 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
591 /*----------------------------------------*/
595 /*----------------------------------------*/
600 #if defined(TRMMKERNEL)
601 REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
605 #if defined(TRMMKERNEL)
606 REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
609 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
613 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 4x4 loop. */
617 bl CGEMM_4x4_LMAIN_SUB
624 /*----------------------------------------*/
625 #if defined(TRMMKERNEL)
643 /*----------------------------------------*/
644 #if defined(TRMMKERNEL)
662 /*----------------------------------------*/
/* Remainder chain: each ble skips to the named label on less-or-equal from a compare that is not visible here. Targets step down 8, 4, 2, 1. */
664 ble CGEMM_L4x4_SUB2_8
670 /*----------------------------------------*/
672 ble CGEMM_L4x4_SUB2_4
678 /*----------------------------------------*/
680 ble CGEMM_L4x4_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
682 KERNEL4x4_L2 64,64, 0,0
683 KERNEL4x4_E2 64,64, 1,1
688 /*----------------------------------------*/
690 ble CGEMM_L4x4_SUB2_1
/* Inline remainder step: a single E2 call. */
692 KERNEL4x4_E2 64,64, 0,1
697 /*----------------------------------------*/
704 /*----------------------------------------*/
706 #if defined(TRMMKERNEL)
707 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
/* Driver fragment for the 4x2 tile. The REFRESH_* macros receive 2,4 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
712 /*----------------------------------------*/
716 /*----------------------------------------*/
719 #if defined(TRMMKERNEL)
720 REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
724 #if defined(TRMMKERNEL)
725 REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
728 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
732 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 4x2 loop. */
736 bl CGEMM_4x2_LMAIN_SUB
743 /*----------------------------------------*/
744 #if defined(TRMMKERNEL)
762 /*----------------------------------------*/
763 #if defined(TRMMKERNEL)
781 /*----------------------------------------*/
/* Remainder chain: each ble skips to the named label on less-or-equal from a compare that is not visible here. Targets step down 8, 4, 2, 1. */
783 ble CGEMM_L4x2_SUB2_8
789 /*----------------------------------------*/
791 ble CGEMM_L4x2_SUB2_4
797 /*----------------------------------------*/
799 ble CGEMM_L4x2_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
801 KERNEL4x2_L2 32,64, 0,0
802 KERNEL4x2_E2 32,64, 1,1
807 /*----------------------------------------*/
809 ble CGEMM_L4x2_SUB2_1
/* Inline remainder step: a single E2 call. */
811 KERNEL4x2_E2 32,64, 0,1
816 /*----------------------------------------*/
823 /*----------------------------------------*/
825 #if defined(TRMMKERNEL)
826 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
/* Driver fragment for the 4x1 tile. The REFRESH_* macros receive 1,4 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
831 /*----------------------------------------*/
835 /*----------------------------------------*/
838 #if defined(TRMMKERNEL)
839 REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
843 #if defined(TRMMKERNEL)
844 REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
847 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
851 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 4x1 loop. */
855 bl CGEMM_4x1_LMAIN_SUB
862 /*----------------------------------------*/
863 #if defined(TRMMKERNEL)
881 /*----------------------------------------*/
882 #if defined(TRMMKERNEL)
900 /*----------------------------------------*/
/* Remainder chain: each ble skips to the named label on less-or-equal from a compare that is not visible here. Targets step down 8, 4, 2, 1. */
902 ble CGEMM_L4x1_SUB2_8
908 /*----------------------------------------*/
910 ble CGEMM_L4x1_SUB2_4
916 /*----------------------------------------*/
918 ble CGEMM_L4x1_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
920 KERNEL4x1_L2 16,64, 0,0
921 KERNEL4x1_E2 16,64, 1,1
926 /*----------------------------------------*/
928 ble CGEMM_L4x1_SUB2_1
/* Inline remainder step: a single E2 call. */
930 KERNEL4x1_E2 16,64, 0,1
935 /*----------------------------------------*/
942 /*----------------------------------------*/
945 #if defined(TRMMKERNEL)
946 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
951 /*----------------------------------------*/
/* For TRMM builds without LEFT, TEMP_REG is advanced by 4. NOTE(review): 4 matches the second tile constant used throughout this section - presumably the column count just processed, confirm. */
955 #if defined(TRMMKERNEL) && !defined(LEFT)
956 addi TEMP_REG, TEMP_REG, 4
964 /* MINI SUBROUTINES */
965 /* 2x8 MAIN 128x+2 LOOP */
/* CGEMM_L2x8_LMAIN_SUB: unrolled KERNEL2x8 sequences for the 2x8 tile. Called with bl from the 2x8 driver. */
/* Every call passes 128,32 followed by a running index. The last call of each run passes 1 as the fourth argument. Macro definitions are not visible here. */
/* NOTE(review): the loop label CGEMM_L2x8_LOOP, the loads, the returns, and the labels of the shorter runs are not visible in this view. */
968 CGEMM_L2x8_LMAIN_SUB:
969 /*----------------------------------------*/
974 /*----------------------------------------*/
/* Loop body: 64 KERNEL2x8_L2 calls, index 0..63. */
977 KERNEL2x8_L2 128,32,0,0
979 /*----------------------------------------*/
980 KERNEL2x8_L2 128,32,1,0
982 KERNEL2x8_L2 128,32,2,0
983 KERNEL2x8_L2 128,32,3,0
986 KERNEL2x8_L2 128,32,4,0
987 KERNEL2x8_L2 128,32,5,0
989 KERNEL2x8_L2 128,32,6,0
990 KERNEL2x8_L2 128,32,7,0
993 KERNEL2x8_L2 128,32,8,0
994 KERNEL2x8_L2 128,32,9,0
995 KERNEL2x8_L2 128,32,10,0
996 KERNEL2x8_L2 128,32,11,0
998 KERNEL2x8_L2 128,32,12,0
999 KERNEL2x8_L2 128,32,13,0
1000 KERNEL2x8_L2 128,32,14,0
1001 KERNEL2x8_L2 128,32,15,0
1002 KERNEL2x8_L2 128,32,16,0
1003 KERNEL2x8_L2 128,32,17,0
1004 KERNEL2x8_L2 128,32,18,0
1005 KERNEL2x8_L2 128,32,19,0
1006 KERNEL2x8_L2 128,32,20,0
1007 KERNEL2x8_L2 128,32,21,0
1008 KERNEL2x8_L2 128,32,22,0
1009 KERNEL2x8_L2 128,32,23,0
1010 KERNEL2x8_L2 128,32,24,0
1011 KERNEL2x8_L2 128,32,25,0
1012 KERNEL2x8_L2 128,32,26,0
1013 KERNEL2x8_L2 128,32,27,0
1014 KERNEL2x8_L2 128,32,28,0
1015 KERNEL2x8_L2 128,32,29,0
1016 KERNEL2x8_L2 128,32,30,0
1017 KERNEL2x8_L2 128,32,31,0
1018 KERNEL2x8_L2 128,32,32,0
1019 KERNEL2x8_L2 128,32,33,0
1020 KERNEL2x8_L2 128,32,34,0
1021 KERNEL2x8_L2 128,32,35,0
1022 KERNEL2x8_L2 128,32,36,0
1023 KERNEL2x8_L2 128,32,37,0
1024 KERNEL2x8_L2 128,32,38,0
1025 KERNEL2x8_L2 128,32,39,0
1026 KERNEL2x8_L2 128,32,40,0
1027 KERNEL2x8_L2 128,32,41,0
1028 KERNEL2x8_L2 128,32,42,0
1029 KERNEL2x8_L2 128,32,43,0
1030 KERNEL2x8_L2 128,32,44,0
1031 KERNEL2x8_L2 128,32,45,0
1032 KERNEL2x8_L2 128,32,46,0
1033 KERNEL2x8_L2 128,32,47,0
1034 KERNEL2x8_L2 128,32,48,0
1035 KERNEL2x8_L2 128,32,49,0
1036 KERNEL2x8_L2 128,32,50,0
1037 KERNEL2x8_L2 128,32,51,0
1038 KERNEL2x8_L2 128,32,52,0
1039 KERNEL2x8_L2 128,32,53,0
1040 KERNEL2x8_L2 128,32,54,0
1041 KERNEL2x8_L2 128,32,55,0
1042 KERNEL2x8_L2 128,32,56,0
1043 KERNEL2x8_L2 128,32,57,0
1044 KERNEL2x8_L2 128,32,58,0
1045 KERNEL2x8_L2 128,32,59,0
1046 KERNEL2x8_L2 128,32,60,0
1047 KERNEL2x8_L2 128,32,61,0
1048 KERNEL2x8_L2 128,32,62,0
1049 KERNEL2x8_L2 128,32,63,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
1050 bdnz CGEMM_L2x8_LOOP
1052 CGEMM_L2x8_LOOP_END:
1053 /*----------------------------------------*/
1060 /*----------------------------------------*/
/* 32-call run, index 0..31, closed by the KERNEL2x8_E2 variant. Presumably the body of CGEMM_2x8_L64_SUB, whose label is not visible - TODO confirm. */
1064 KERNEL2x8_L2 128,32,0,0
1065 KERNEL2x8_L2 128,32,1,0
1067 KERNEL2x8_L2 128,32,2,0
1068 KERNEL2x8_L2 128,32,3,0
1071 KERNEL2x8_L2 128,32,4,0
1072 KERNEL2x8_L2 128,32,5,0
1074 KERNEL2x8_L2 128,32,6,0
1075 KERNEL2x8_L2 128,32,7,0
1078 KERNEL2x8_L2 128,32,8,0
1079 KERNEL2x8_L2 128,32,9,0
1080 KERNEL2x8_L2 128,32,10,0
1081 KERNEL2x8_L2 128,32,11,0
1083 KERNEL2x8_L2 128,32,12,0
1084 KERNEL2x8_L2 128,32,13,0
1085 KERNEL2x8_L2 128,32,14,0
1086 KERNEL2x8_L2 128,32,15,0
1087 KERNEL2x8_L2 128,32,16,0
1088 KERNEL2x8_L2 128,32,17,0
1089 KERNEL2x8_L2 128,32,18,0
1090 KERNEL2x8_L2 128,32,19,0
1091 KERNEL2x8_L2 128,32,20,0
1092 KERNEL2x8_L2 128,32,21,0
1093 KERNEL2x8_L2 128,32,22,0
1094 KERNEL2x8_L2 128,32,23,0
1095 KERNEL2x8_L2 128,32,24,0
1096 KERNEL2x8_L2 128,32,25,0
1097 KERNEL2x8_L2 128,32,26,0
1098 KERNEL2x8_L2 128,32,27,0
1099 KERNEL2x8_L2 128,32,28,0
1100 KERNEL2x8_L2 128,32,29,0
1101 KERNEL2x8_L2 128,32,30,0
1102 KERNEL2x8_E2 128,32,31,1
1108 /*----------------------------------------*/
/* 16-call run, index 0..15, closed by KERNEL2x8_E2. Presumably the body of CGEMM_2x8_L32_SUB - TODO confirm. */
1112 KERNEL2x8_L2 128,32,0,0
1113 KERNEL2x8_L2 128,32,1,0
1115 KERNEL2x8_L2 128,32,2,0
1116 KERNEL2x8_L2 128,32,3,0
1119 KERNEL2x8_L2 128,32,4,0
1120 KERNEL2x8_L2 128,32,5,0
1122 KERNEL2x8_L2 128,32,6,0
1123 KERNEL2x8_L2 128,32,7,0
1126 KERNEL2x8_L2 128,32,8,0
1127 KERNEL2x8_L2 128,32,9,0
1128 KERNEL2x8_L2 128,32,10,0
1129 KERNEL2x8_L2 128,32,11,0
1131 KERNEL2x8_L2 128,32,12,0
1132 KERNEL2x8_L2 128,32,13,0
1133 KERNEL2x8_L2 128,32,14,0
1134 KERNEL2x8_E2 128,32,15,1
1140 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by KERNEL2x8_E2. Presumably the body of CGEMM_2x8_L16_SUB - TODO confirm. */
1144 KERNEL2x8_L2 128,32,0,0
1145 KERNEL2x8_L2 128,32,1,0
1147 KERNEL2x8_L2 128,32,2,0
1148 KERNEL2x8_L2 128,32,3,0
1151 KERNEL2x8_L2 128,32,4,0
1152 KERNEL2x8_L2 128,32,5,0
1154 KERNEL2x8_L2 128,32,6,0
1155 KERNEL2x8_E2 128,32,7,1
/* CGEMM_2x4_LMAIN_SUB: unrolled KERNEL2x4 sequences for the 2x4 tile. Called with bl from the 2x4 driver. */
/* Every call passes 64,32 followed by a running index. The last call of each run passes 1 as the fourth argument. */
/* NOTE(review): the loop label CGEMM_L2x4_LOOP, the loads, the returns, and the labels of the shorter runs are not visible in this view. */
1160 CGEMM_2x4_LMAIN_SUB:
1161 /*----------------------------------------*/
1166 /*----------------------------------------*/
/* Loop body: 16 KERNEL2x4_L2 calls, index 0..15. */
1167 KERNEL2x4_L2 64,32,0,0
1169 /*----------------------------------------*/
1170 KERNEL2x4_L2 64,32,1,0
1171 KERNEL2x4_L2 64,32,2,0
1172 KERNEL2x4_L2 64,32,3,0
1173 KERNEL2x4_L2 64,32,4,0
1174 KERNEL2x4_L2 64,32,5,0
1175 KERNEL2x4_L2 64,32,6,0
1176 KERNEL2x4_L2 64,32,7,0
1177 KERNEL2x4_L2 64,32,8,0
1178 KERNEL2x4_L2 64,32,9,0
1179 KERNEL2x4_L2 64,32,10,0
1180 KERNEL2x4_L2 64,32,11,0
1181 KERNEL2x4_L2 64,32,12,0
1182 KERNEL2x4_L2 64,32,13,0
1183 KERNEL2x4_L2 64,32,14,0
1184 KERNEL2x4_L2 64,32,15,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
1185 bdnz CGEMM_L2x4_LOOP
1187 CGEMM_L2x4_LOOP_END:
1188 /*----------------------------------------*/
1195 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by the KERNEL2x4_E2 variant. Presumably the body of CGEMM_2x4_L16_SUB, whose label is not visible - TODO confirm. */
1197 KERNEL2x4_L2 64,32,0,0
1198 KERNEL2x4_L2 64,32,1,0
1199 KERNEL2x4_L2 64,32,2,0
1200 KERNEL2x4_L2 64,32,3,0
1201 KERNEL2x4_L2 64,32,4,0
1202 KERNEL2x4_L2 64,32,5,0
1203 KERNEL2x4_L2 64,32,6,0
1204 KERNEL2x4_E2 64,32,7,1
1210 /*----------------------------------------*/
/* 4-call run, index 0..3, closed by KERNEL2x4_E2. */
1212 KERNEL2x4_L2 64,32,0,0
1213 KERNEL2x4_L2 64,32,1,0
1214 KERNEL2x4_L2 64,32,2,0
1215 KERNEL2x4_E2 64,32,3,1
/* CGEMM_2x2_LMAIN_SUB: unrolled KERNEL2x2 sequences for the 2x2 tile. Called with bl from the 2x2 driver. */
/* Every call passes 32,32 followed by a running index. The last call of each run passes 1 as the fourth argument. */
/* NOTE(review): the loop label CGEMM_L2x2_LOOP, the loads, the returns, and the labels of the shorter runs are not visible in this view. */
1219 CGEMM_2x2_LMAIN_SUB:
1220 /*----------------------------------------*/
1225 /*----------------------------------------*/
/* Loop body: 16 KERNEL2x2_L2 calls, index 0..15. */
1226 KERNEL2x2_L2 32,32,0,0
1228 /*----------------------------------------*/
1229 KERNEL2x2_L2 32,32,1,0
1230 KERNEL2x2_L2 32,32,2,0
1231 KERNEL2x2_L2 32,32,3,0
1232 KERNEL2x2_L2 32,32,4,0
1233 KERNEL2x2_L2 32,32,5,0
1234 KERNEL2x2_L2 32,32,6,0
1235 KERNEL2x2_L2 32,32,7,0
1236 KERNEL2x2_L2 32,32,8,0
1237 KERNEL2x2_L2 32,32,9,0
1238 KERNEL2x2_L2 32,32,10,0
1239 KERNEL2x2_L2 32,32,11,0
1240 KERNEL2x2_L2 32,32,12,0
1241 KERNEL2x2_L2 32,32,13,0
1242 KERNEL2x2_L2 32,32,14,0
1243 KERNEL2x2_L2 32,32,15,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
1244 bdnz CGEMM_L2x2_LOOP
1248 CGEMM_L2x2_LOOP_END:
1249 /*----------------------------------------*/
1254 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by the KERNEL2x2_E2 variant. Presumably the body of CGEMM_2x2_L16_SUB, whose label is not visible - TODO confirm. */
1256 KERNEL2x2_L2 32,32,0,0
1257 KERNEL2x2_L2 32,32,1,0
1258 KERNEL2x2_L2 32,32,2,0
1259 KERNEL2x2_L2 32,32,3,0
1260 KERNEL2x2_L2 32,32,4,0
1261 KERNEL2x2_L2 32,32,5,0
1262 KERNEL2x2_L2 32,32,6,0
1263 KERNEL2x2_E2 32,32,7,1
1267 /*----------------------------------------*/
/* 4-call run, index 0..3, closed by KERNEL2x2_E2. */
1269 KERNEL2x2_L2 32,32,0,0
1270 KERNEL2x2_L2 32,32,1,0
1271 KERNEL2x2_L2 32,32,2,0
1272 KERNEL2x2_E2 32,32,3,1
/* CGEMM_2x1_LMAIN_SUB: unrolled KERNEL2x1 sequences for the 2x1 tile. Called with bl from the 2x1 driver. */
/* Every call passes 16,32 followed by a running index. The last call of each run passes 1 as the fourth argument. */
/* NOTE(review): the loop label CGEMM_L2x1_LOOP, the loads, the returns, and the labels of the shorter runs are not visible in this view. */
1276 CGEMM_2x1_LMAIN_SUB:
1277 /*----------------------------------------*/
1282 /*----------------------------------------*/
/* Loop body: 16 KERNEL2x1_L2 calls, index 0..15. */
1283 KERNEL2x1_L2 16,32,0,0
1285 /*----------------------------------------*/
1286 KERNEL2x1_L2 16,32,1,0
1287 KERNEL2x1_L2 16,32,2,0
1288 KERNEL2x1_L2 16,32,3,0
1289 KERNEL2x1_L2 16,32,4,0
1290 KERNEL2x1_L2 16,32,5,0
1291 KERNEL2x1_L2 16,32,6,0
1292 KERNEL2x1_L2 16,32,7,0
1293 KERNEL2x1_L2 16,32,8,0
1294 KERNEL2x1_L2 16,32,9,0
1295 KERNEL2x1_L2 16,32,10,0
1296 KERNEL2x1_L2 16,32,11,0
1297 KERNEL2x1_L2 16,32,12,0
1298 KERNEL2x1_L2 16,32,13,0
1299 KERNEL2x1_L2 16,32,14,0
1300 KERNEL2x1_L2 16,32,15,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
1301 bdnz CGEMM_L2x1_LOOP
1303 CGEMM_L2x1_LOOP_END:
1304 /*----------------------------------------*/
1310 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by the KERNEL2x1_E2 variant. Presumably the body of CGEMM_2x1_L16_SUB, whose label is not visible - TODO confirm. */
1312 KERNEL2x1_L2 16,32,0,0
1313 KERNEL2x1_L2 16,32,1,0
1314 KERNEL2x1_L2 16,32,2,0
1315 KERNEL2x1_L2 16,32,3,0
1316 KERNEL2x1_L2 16,32,4,0
1317 KERNEL2x1_L2 16,32,5,0
1318 KERNEL2x1_L2 16,32,6,0
1319 KERNEL2x1_E2 16,32,7,1
1325 /*----------------------------------------*/
/* 4-call run, index 0..3, closed by KERNEL2x1_E2. */
1327 KERNEL2x1_L2 16,32,0,0
1328 KERNEL2x1_L2 16,32,1,0
1329 KERNEL2x1_L2 16,32,2,0
1330 KERNEL2x1_E2 16,32,3,1
1335 /* MAIN LOOP BEGINS */
/* Driver fragment for the 2x8 tile: pointer refresh, pass-count computation, call into the unrolled loop, then a chain of remainder steps. */
/* NOTE(review): the labels (including CGEMM_L2x8_BEGIN, targeted by the closing bgt), compare instructions, and conditional-compilation terminators are not visible in this view. */
1340 /*----------------------------------------*/
1347 /*----------------------------------------*/
1353 #if defined(TRMMKERNEL) && defined(LEFT)
1354 mr TEMP_REG, OFFSET /*off = offset;*/
/* dcbt is a cache-touch hint for the block addressed by CO plus the contents of r0. It does not change architected state. */
1358 dcbt CO,r0 /*just prefetch*/
1363 /*----------------------------------------*/
/* The trailing 8,2 arguments of the REFRESH_* macros match the tile shape in this section's kernel names (2x8). The macros themselves are defined elsewhere. */
1364 #if defined(TRMMKERNEL)
1365 REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
1371 #if defined(TRMMKERNEL)
1372 REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
1374 /* TEMPS FOR PREFETCH */
1378 /* TEMPS FOR PREFETCH */
1381 srawi. T8, T1, 7 /* T8 = T1 >> 7 (arithmetic shift, i.e. floor(T1 / 128), a quotient), CR0 updated. Original note: (T1-2) % 128x */
1384 /* TEMPS FOR PREFETCH */
1388 /* TEMPS FOR PREFETCH */
1391 srawi. T8, T1, 7 /* T8 = T1 >> 7 (arithmetic shift, i.e. floor(T1 / 128), a quotient), CR0 updated. Original note: (K-2) % 128x */
/* bl stores the return address in LR and jumps to the unrolled main loop. */
1395 bl CGEMM_L2x8_LMAIN_SUB
1402 /*----------------------------------------*/
1403 #if defined(TRMMKERNEL)
1421 /*----------------------------------------*/
1422 #if defined(TRMMKERNEL)
1439 /*----------------------------------------*/
/* Remainder chain: each ble skips the step on less-or-equal from a compare that is not visible here. Otherwise the step runs, as a bl to a helper for the three largest steps and as inline kernel calls for the rest. */
1441 ble CGEMM_L2x8_SUB2_32
1442 bl CGEMM_2x8_L64_SUB
1447 /*----------------------------------------*/
1449 ble CGEMM_L2x8_SUB2_16
1450 bl CGEMM_2x8_L32_SUB
1455 /*----------------------------------------*/
1457 ble CGEMM_L2x8_SUB2_8
1458 bl CGEMM_2x8_L16_SUB
1463 /*----------------------------------------*/
1465 ble CGEMM_L2x8_SUB2_4
/* Inline remainder step: 4 kernel calls, the last being the E2 variant. */
1467 KERNEL2x8_L2 128,32, 0,0
1468 KERNEL2x8_L2 128,32, 1,0
1469 KERNEL2x8_L2 128,32, 2,0
1470 KERNEL2x8_E2 128,32, 3,1
1475 /*----------------------------------------*/
1477 ble CGEMM_L2x8_SUB2_2
/* Inline remainder step: 2 kernel calls. */
1479 KERNEL2x8_L2 128,32, 0,0
1480 KERNEL2x8_E2 128,32, 1,1
1485 /*----------------------------------------*/
1487 ble CGEMM_L2x8_SUB2_1
/* Inline remainder step: a single E2 call. */
1489 KERNEL2x8_E2 128,32, 0,1
1494 /*----------------------------------------*/
1501 /*----------------------------------------*/
1505 #if defined(TRMMKERNEL)
1506 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
/* Loops back to the start of the 2x8 tile while the condition register reports greater-than. The instruction that sets it is not visible here. */
1508 bgt CGEMM_L2x8_BEGIN
/* Driver fragment for the 2x4 tile. The REFRESH_* macros receive 4,2 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
1518 /*----------------------------------------*/
1522 /*----------------------------------------*/
1527 #if defined(TRMMKERNEL)
1528 REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
1532 #if defined(TRMMKERNEL)
1533 REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
1536 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
1540 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 2x4 loop. */
1544 bl CGEMM_2x4_LMAIN_SUB
1551 /*----------------------------------------*/
1552 #if defined(TRMMKERNEL)
1570 /*----------------------------------------*/
1571 #if defined(TRMMKERNEL)
1589 /*----------------------------------------*/
/* Remainder chain: each ble skips the step on less-or-equal from a compare that is not visible here. */
1591 ble CGEMM_L2x4_SUB2_8
1592 bl CGEMM_2x4_L16_SUB
1597 /*----------------------------------------*/
1599 ble CGEMM_L2x4_SUB2_4
1605 /*----------------------------------------*/
1607 ble CGEMM_L2x4_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
1609 KERNEL2x4_L2 64,32, 0,0
1610 KERNEL2x4_E2 64,32, 1,1
1615 /*----------------------------------------*/
1617 ble CGEMM_L2x4_SUB2_1
/* Inline remainder step: a single E2 call. */
1619 KERNEL2x4_E2 64,32, 0,1
1624 /*----------------------------------------*/
1631 /*----------------------------------------*/
1633 #if defined(TRMMKERNEL)
1634 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
/* Driver fragment for the 2x2 tile. The REFRESH_* macros receive 2,2 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
1639 /*----------------------------------------*/
1643 /*----------------------------------------*/
1646 #if defined(TRMMKERNEL)
1647 REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
1651 #if defined(TRMMKERNEL)
1652 REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
1655 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
1659 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 2x2 loop. */
1663 bl CGEMM_2x2_LMAIN_SUB
1670 /*----------------------------------------*/
1671 #if defined(TRMMKERNEL)
1689 /*----------------------------------------*/
1690 #if defined(TRMMKERNEL)
1708 /*----------------------------------------*/
/* Remainder chain: each ble skips the step on less-or-equal from a compare that is not visible here. */
1710 ble CGEMM_L2x2_SUB2_8
1711 bl CGEMM_2x2_L16_SUB
1716 /*----------------------------------------*/
1718 ble CGEMM_L2x2_SUB2_4
1724 /*----------------------------------------*/
1726 ble CGEMM_L2x2_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
1728 KERNEL2x2_L2 32,32, 0,0
1729 KERNEL2x2_E2 32,32, 1,1
1734 /*----------------------------------------*/
1736 ble CGEMM_L2x2_SUB2_1
/* Inline remainder step: a single E2 call. */
1738 KERNEL2x2_E2 32,32, 0,1
1743 /*----------------------------------------*/
1750 /*----------------------------------------*/
1752 #if defined(TRMMKERNEL)
1753 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
/* Driver fragment for the 2x1 tile. The REFRESH_* macros receive 1,2 as their trailing arguments. */
/* NOTE(review): labels, compares, and conditional-compilation terminators for this fragment are not visible in this view. */
1758 /*----------------------------------------*/
1762 /*----------------------------------------*/
1765 #if defined(TRMMKERNEL)
1766 REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
1770 #if defined(TRMMKERNEL)
1771 REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
1774 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (T1-2) % 32x */
1778 srawi. T8, T1, 5 /* T8 = T1 >> 5 (arithmetic shift, i.e. floor(T1 / 32), a quotient), CR0 updated. Original note: (K-2) % 32x */
/* bl stores the return address in LR and jumps to the unrolled 2x1 loop. */
1782 bl CGEMM_2x1_LMAIN_SUB
1789 /*----------------------------------------*/
1790 #if defined(TRMMKERNEL)
1808 /*----------------------------------------*/
1809 #if defined(TRMMKERNEL)
1827 /*----------------------------------------*/
/* Remainder chain: each ble skips the step on less-or-equal from a compare that is not visible here. */
1829 ble CGEMM_L2x1_SUB2_8
1830 bl CGEMM_2x1_L16_SUB
1835 /*----------------------------------------*/
1837 ble CGEMM_L2x1_SUB2_4
1843 /*----------------------------------------*/
1845 ble CGEMM_L2x1_SUB2_2
/* Inline remainder step: 2 kernel calls, the last being the E2 variant. */
1847 KERNEL2x1_L2 16,32, 0,0
1848 KERNEL2x1_E2 16,32, 1,1
1853 /*----------------------------------------*/
1855 ble CGEMM_L2x1_SUB2_1
/* Inline remainder step: a single E2 call. */
1857 KERNEL2x1_E2 16,32, 0,1
1862 /*----------------------------------------*/
1869 /*----------------------------------------*/
1872 #if defined(TRMMKERNEL)
1873 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
1878 /*----------------------------------------*/
/* For TRMM builds without LEFT, TEMP_REG is advanced by 2. NOTE(review): 2 matches the second tile constant used throughout this section - presumably the column count just processed, confirm. */
1882 #if defined(TRMMKERNEL) && !defined(LEFT)
1883 addi TEMP_REG, TEMP_REG, 2
1890 /* MINI SUBROUTINES */
1891 /* 1x8 MAIN 128x+2 LOOP */
/* CGEMM_L1x8_LMAIN_SUB: unrolled KERNEL1x8 sequences for the 1x8 tile. */
/* Every call passes 128,16 followed by a running index. The last call of each run passes 1 as the fourth argument. Macro definitions are not visible here. */
/* NOTE(review): the loop label CGEMM_L1x8_LOOP, the loads, the returns, the labels of the shorter runs, and the caller are not visible in this view. */
1894 CGEMM_L1x8_LMAIN_SUB:
1895 /*----------------------------------------*/
1900 /*----------------------------------------*/
/* Loop body: 64 KERNEL1x8_L2 calls, index 0..63. */
1903 KERNEL1x8_L2 128,16,0,0
1905 /*----------------------------------------*/
1906 KERNEL1x8_L2 128,16,1,0
1908 KERNEL1x8_L2 128,16,2,0
1909 KERNEL1x8_L2 128,16,3,0
1912 KERNEL1x8_L2 128,16,4,0
1913 KERNEL1x8_L2 128,16,5,0
1915 KERNEL1x8_L2 128,16,6,0
1916 KERNEL1x8_L2 128,16,7,0
1919 KERNEL1x8_L2 128,16,8,0
1920 KERNEL1x8_L2 128,16,9,0
1921 KERNEL1x8_L2 128,16,10,0
1922 KERNEL1x8_L2 128,16,11,0
1924 KERNEL1x8_L2 128,16,12,0
1925 KERNEL1x8_L2 128,16,13,0
1926 KERNEL1x8_L2 128,16,14,0
1927 KERNEL1x8_L2 128,16,15,0
1928 KERNEL1x8_L2 128,16,16,0
1929 KERNEL1x8_L2 128,16,17,0
1930 KERNEL1x8_L2 128,16,18,0
1931 KERNEL1x8_L2 128,16,19,0
1932 KERNEL1x8_L2 128,16,20,0
1933 KERNEL1x8_L2 128,16,21,0
1934 KERNEL1x8_L2 128,16,22,0
1935 KERNEL1x8_L2 128,16,23,0
1936 KERNEL1x8_L2 128,16,24,0
1937 KERNEL1x8_L2 128,16,25,0
1938 KERNEL1x8_L2 128,16,26,0
1939 KERNEL1x8_L2 128,16,27,0
1940 KERNEL1x8_L2 128,16,28,0
1941 KERNEL1x8_L2 128,16,29,0
1942 KERNEL1x8_L2 128,16,30,0
1943 KERNEL1x8_L2 128,16,31,0
1944 KERNEL1x8_L2 128,16,32,0
1945 KERNEL1x8_L2 128,16,33,0
1946 KERNEL1x8_L2 128,16,34,0
1947 KERNEL1x8_L2 128,16,35,0
1948 KERNEL1x8_L2 128,16,36,0
1949 KERNEL1x8_L2 128,16,37,0
1950 KERNEL1x8_L2 128,16,38,0
1951 KERNEL1x8_L2 128,16,39,0
1952 KERNEL1x8_L2 128,16,40,0
1953 KERNEL1x8_L2 128,16,41,0
1954 KERNEL1x8_L2 128,16,42,0
1955 KERNEL1x8_L2 128,16,43,0
1956 KERNEL1x8_L2 128,16,44,0
1957 KERNEL1x8_L2 128,16,45,0
1958 KERNEL1x8_L2 128,16,46,0
1959 KERNEL1x8_L2 128,16,47,0
1960 KERNEL1x8_L2 128,16,48,0
1961 KERNEL1x8_L2 128,16,49,0
1962 KERNEL1x8_L2 128,16,50,0
1963 KERNEL1x8_L2 128,16,51,0
1964 KERNEL1x8_L2 128,16,52,0
1965 KERNEL1x8_L2 128,16,53,0
1966 KERNEL1x8_L2 128,16,54,0
1967 KERNEL1x8_L2 128,16,55,0
1968 KERNEL1x8_L2 128,16,56,0
1969 KERNEL1x8_L2 128,16,57,0
1970 KERNEL1x8_L2 128,16,58,0
1971 KERNEL1x8_L2 128,16,59,0
1972 KERNEL1x8_L2 128,16,60,0
1973 KERNEL1x8_L2 128,16,61,0
1974 KERNEL1x8_L2 128,16,62,0
1975 KERNEL1x8_L2 128,16,63,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
1976 bdnz CGEMM_L1x8_LOOP
1978 CGEMM_L1x8_LOOP_END:
1979 /*----------------------------------------*/
1986 /*----------------------------------------*/
/* 32-call run, index 0..31, closed by the KERNEL1x8_E2 variant. Its label is not visible in this view. */
1990 KERNEL1x8_L2 128,16,0,0
1991 KERNEL1x8_L2 128,16,1,0
1993 KERNEL1x8_L2 128,16,2,0
1994 KERNEL1x8_L2 128,16,3,0
1997 KERNEL1x8_L2 128,16,4,0
1998 KERNEL1x8_L2 128,16,5,0
2000 KERNEL1x8_L2 128,16,6,0
2001 KERNEL1x8_L2 128,16,7,0
2004 KERNEL1x8_L2 128,16,8,0
2005 KERNEL1x8_L2 128,16,9,0
2006 KERNEL1x8_L2 128,16,10,0
2007 KERNEL1x8_L2 128,16,11,0
2009 KERNEL1x8_L2 128,16,12,0
2010 KERNEL1x8_L2 128,16,13,0
2011 KERNEL1x8_L2 128,16,14,0
2012 KERNEL1x8_L2 128,16,15,0
2013 KERNEL1x8_L2 128,16,16,0
2014 KERNEL1x8_L2 128,16,17,0
2015 KERNEL1x8_L2 128,16,18,0
2016 KERNEL1x8_L2 128,16,19,0
2017 KERNEL1x8_L2 128,16,20,0
2018 KERNEL1x8_L2 128,16,21,0
2019 KERNEL1x8_L2 128,16,22,0
2020 KERNEL1x8_L2 128,16,23,0
2021 KERNEL1x8_L2 128,16,24,0
2022 KERNEL1x8_L2 128,16,25,0
2023 KERNEL1x8_L2 128,16,26,0
2024 KERNEL1x8_L2 128,16,27,0
2025 KERNEL1x8_L2 128,16,28,0
2026 KERNEL1x8_L2 128,16,29,0
2027 KERNEL1x8_L2 128,16,30,0
2028 KERNEL1x8_E2 128,16,31,1
2034 /*----------------------------------------*/
/* 16-call run, index 0..15, closed by KERNEL1x8_E2. */
2038 KERNEL1x8_L2 128,16,0,0
2039 KERNEL1x8_L2 128,16,1,0
2041 KERNEL1x8_L2 128,16,2,0
2042 KERNEL1x8_L2 128,16,3,0
2045 KERNEL1x8_L2 128,16,4,0
2046 KERNEL1x8_L2 128,16,5,0
2048 KERNEL1x8_L2 128,16,6,0
2049 KERNEL1x8_L2 128,16,7,0
2052 KERNEL1x8_L2 128,16,8,0
2053 KERNEL1x8_L2 128,16,9,0
2054 KERNEL1x8_L2 128,16,10,0
2055 KERNEL1x8_L2 128,16,11,0
2057 KERNEL1x8_L2 128,16,12,0
2058 KERNEL1x8_L2 128,16,13,0
2059 KERNEL1x8_L2 128,16,14,0
2060 KERNEL1x8_E2 128,16,15,1
2066 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by KERNEL1x8_E2. */
2070 KERNEL1x8_L2 128,16,0,0
2071 KERNEL1x8_L2 128,16,1,0
2073 KERNEL1x8_L2 128,16,2,0
2074 KERNEL1x8_L2 128,16,3,0
2077 KERNEL1x8_L2 128,16,4,0
2078 KERNEL1x8_L2 128,16,5,0
2080 KERNEL1x8_L2 128,16,6,0
2081 KERNEL1x8_E2 128,16,7,1
/* CGEMM_1x4_LMAIN_SUB: unrolled KERNEL1x4 sequences for the 1x4 tile. */
/* Every call passes 64,16 followed by a running index. The last call of each run passes 1 as the fourth argument. */
/* NOTE(review): the loop label CGEMM_L1x4_LOOP, the loads, the returns, the labels of the shorter runs, and the caller are not visible in this view. */
2086 CGEMM_1x4_LMAIN_SUB:
2087 /*----------------------------------------*/
2092 /*----------------------------------------*/
/* Loop body: 16 KERNEL1x4_L2 calls, index 0..15. */
2093 KERNEL1x4_L2 64,16,0,0
2095 /*----------------------------------------*/
2096 KERNEL1x4_L2 64,16,1,0
2097 KERNEL1x4_L2 64,16,2,0
2098 KERNEL1x4_L2 64,16,3,0
2099 KERNEL1x4_L2 64,16,4,0
2100 KERNEL1x4_L2 64,16,5,0
2101 KERNEL1x4_L2 64,16,6,0
2102 KERNEL1x4_L2 64,16,7,0
2103 KERNEL1x4_L2 64,16,8,0
2104 KERNEL1x4_L2 64,16,9,0
2105 KERNEL1x4_L2 64,16,10,0
2106 KERNEL1x4_L2 64,16,11,0
2107 KERNEL1x4_L2 64,16,12,0
2108 KERNEL1x4_L2 64,16,13,0
2109 KERNEL1x4_L2 64,16,14,0
2110 KERNEL1x4_L2 64,16,15,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
2111 bdnz CGEMM_L1x4_LOOP
2113 CGEMM_L1x4_LOOP_END:
2114 /*----------------------------------------*/
2121 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by the KERNEL1x4_E2 variant. Its label is not visible in this view. */
2123 KERNEL1x4_L2 64,16,0,0
2124 KERNEL1x4_L2 64,16,1,0
2125 KERNEL1x4_L2 64,16,2,0
2126 KERNEL1x4_L2 64,16,3,0
2127 KERNEL1x4_L2 64,16,4,0
2128 KERNEL1x4_L2 64,16,5,0
2129 KERNEL1x4_L2 64,16,6,0
2130 KERNEL1x4_E2 64,16,7,1
2136 /*----------------------------------------*/
/* 4-call run, index 0..3, closed by KERNEL1x4_E2. */
2138 KERNEL1x4_L2 64,16,0,0
2139 KERNEL1x4_L2 64,16,1,0
2140 KERNEL1x4_L2 64,16,2,0
2141 KERNEL1x4_E2 64,16,3,1
/* CGEMM_1x2_LMAIN_SUB: unrolled KERNEL1x2 sequences for the 1x2 tile. */
/* Every call passes 32,16 followed by a running index. The last call of each run passes 1 as the fourth argument. */
/* NOTE(review): the loop label CGEMM_L1x2_LOOP, the loads, the returns, the labels of the shorter runs, and the caller are not visible in this view. */
2145 CGEMM_1x2_LMAIN_SUB:
2146 /*----------------------------------------*/
2151 /*----------------------------------------*/
/* Loop body: 16 KERNEL1x2_L2 calls, index 0..15. */
2152 KERNEL1x2_L2 32,16,0,0
2154 /*----------------------------------------*/
2155 KERNEL1x2_L2 32,16,1,0
2156 KERNEL1x2_L2 32,16,2,0
2157 KERNEL1x2_L2 32,16,3,0
2158 KERNEL1x2_L2 32,16,4,0
2159 KERNEL1x2_L2 32,16,5,0
2160 KERNEL1x2_L2 32,16,6,0
2161 KERNEL1x2_L2 32,16,7,0
2162 KERNEL1x2_L2 32,16,8,0
2163 KERNEL1x2_L2 32,16,9,0
2164 KERNEL1x2_L2 32,16,10,0
2165 KERNEL1x2_L2 32,16,11,0
2166 KERNEL1x2_L2 32,16,12,0
2167 KERNEL1x2_L2 32,16,13,0
2168 KERNEL1x2_L2 32,16,14,0
2169 KERNEL1x2_L2 32,16,15,1
/* bdnz decrements CTR and branches back while CTR is non-zero. CTR is loaded outside this view. */
2170 bdnz CGEMM_L1x2_LOOP
2174 CGEMM_L1x2_LOOP_END:
2175 /*----------------------------------------*/
2180 /*----------------------------------------*/
/* 8-call run, index 0..7, closed by the KERNEL1x2_E2 variant. Its label is not visible in this view. */
2182 KERNEL1x2_L2 32,16,0,0
2183 KERNEL1x2_L2 32,16,1,0
2184 KERNEL1x2_L2 32,16,2,0
2185 KERNEL1x2_L2 32,16,3,0
2186 KERNEL1x2_L2 32,16,4,0
2187 KERNEL1x2_L2 32,16,5,0
2188 KERNEL1x2_L2 32,16,6,0
2189 KERNEL1x2_E2 32,16,7,1
2193 /*----------------------------------------*/
/* 4-call run, index 0..3, closed by KERNEL1x2_E2. */
2195 KERNEL1x2_L2 32,16,0,0
2196 KERNEL1x2_L2 32,16,1,0
2197 KERNEL1x2_L2 32,16,2,0
2198 KERNEL1x2_E2 32,16,3,1
/* 1x1 main-loop subroutine (entered with `bl` from the 1x1 driver).
   One pass of the loop is 16 back-to-back KERNEL1x1_L2 invocations with the index
   argument running 0..15; only the final invocation passes 1 as its last argument.
   NOTE(review): the macro definitions are outside this excerpt - presumably the third
   argument is the unroll index and the fourth marks the last step of a pass; confirm. */
2202 CGEMM_1x1_LMAIN_SUB:
2203 /*----------------------------------------*/
2208 /*----------------------------------------*/
2209 KERNEL1x1_L2 16,16,0,0
2211 /*----------------------------------------*/
2212 KERNEL1x1_L2 16,16,1,0
2213 KERNEL1x1_L2 16,16,2,0
2214 KERNEL1x1_L2 16,16,3,0
2215 KERNEL1x1_L2 16,16,4,0
2216 KERNEL1x1_L2 16,16,5,0
2217 KERNEL1x1_L2 16,16,6,0
2218 KERNEL1x1_L2 16,16,7,0
2219 KERNEL1x1_L2 16,16,8,0
2220 KERNEL1x1_L2 16,16,9,0
2221 KERNEL1x1_L2 16,16,10,0
2222 KERNEL1x1_L2 16,16,11,0
2223 KERNEL1x1_L2 16,16,12,0
2224 KERNEL1x1_L2 16,16,13,0
2225 KERNEL1x1_L2 16,16,14,0
2226 KERNEL1x1_L2 16,16,15,1
/* bdnz: decrement CTR and run another pass while CTR != 0 */
2227 bdnz CGEMM_L1x1_LOOP
2229 CGEMM_L1x1_LOOP_END:
2230 /*----------------------------------------*/
2236 /*----------------------------------------*/
/* Straight-line run of 8 invocations (7 x _L2, then _E2 with last argument 1).
   NOTE(review): label not visible here; presumably CGEMM_1x1_L16_SUB - confirm. */
2238 KERNEL1x1_L2 16,16,0,0
2239 KERNEL1x1_L2 16,16,1,0
2240 KERNEL1x1_L2 16,16,2,0
2241 KERNEL1x1_L2 16,16,3,0
2242 KERNEL1x1_L2 16,16,4,0
2243 KERNEL1x1_L2 16,16,5,0
2244 KERNEL1x1_L2 16,16,6,0
2245 KERNEL1x1_E2 16,16,7,1
2251 /*----------------------------------------*/
/* Straight-line run of 4 invocations (3 x _L2, then _E2 with last argument 1).
   NOTE(review): label not visible in this excerpt - confirm which entry point this is. */
2253 KERNEL1x1_L2 16,16,0,0
2254 KERNEL1x1_L2 16,16,1,0
2255 KERNEL1x1_L2 16,16,2,0
2256 KERNEL1x1_E2 16,16,3,1
2261 /* MAIN LOOP BEGINS */
/* 1x8 driver. Visible flow: TRMM-only offset/pointer setup, a `bl` into the unrolled
   main subroutine, then a cascade of `ble`-guarded remainder chunks (subroutine calls
   for the larger ones, inline kernel invocations for the smaller ones), and finally a
   `bgt` back to CGEMM_L1x8_BEGIN.
   NOTE(review): several lines are not visible in this excerpt (labels, the tests that
   set the condition for each ble/bgt, and the matching #else/#endif lines). */
2266 /*----------------------------------------*/
2272 /*----------------------------------------*/
2277 #if defined(TRMMKERNEL) && defined(LEFT)
2278 mr TEMP_REG, OFFSET /*off = offset;*/
2282 dcbt CO,r0 /*just prefetch*/
2287 /*----------------------------------------*/
2288 #if defined(TRMMKERNEL)
2289 REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
2295 #if defined(TRMMKERNEL)
2296 REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
2298 /* TEMPS FOR PREFETCH */
2302 /* TEMPS FOR PREFETCH */
2305 srawi. T8, T1, 7 /* T8 = T1 >> 7: number of 128x blocks in (T1-2); record form also sets CR0 */
2308 /* TEMPS FOR PREFETCH */
2312 /* TEMPS FOR PREFETCH */
2315 srawi. T8, T1, 7 /* T8 = T1 >> 7: number of 128x blocks in (K-2); record form also sets CR0 */
2319 bl CGEMM_L1x8_LMAIN_SUB
2326 /*----------------------------------------*/
2327 #if defined(TRMMKERNEL)
2345 /*----------------------------------------*/
2346 #if defined(TRMMKERNEL)
2363 /*----------------------------------------*/
/* remainder chunk handled by a subroutine call; skipped when the (not visible) test is <= 0 */
2365 ble CGEMM_L1x8_SUB2_32
2366 bl CGEMM_1x8_L64_SUB
2371 /*----------------------------------------*/
2373 ble CGEMM_L1x8_SUB2_16
2374 bl CGEMM_1x8_L32_SUB
2379 /*----------------------------------------*/
2381 ble CGEMM_L1x8_SUB2_8
2382 bl CGEMM_1x8_L16_SUB
2387 /*----------------------------------------*/
2389 ble CGEMM_L1x8_SUB2_4
/* inline remainder: 4 kernel invocations, the last one (_E2) passes 1 as its final argument */
2391 KERNEL1x8_L2 128,16, 0,0
2392 KERNEL1x8_L2 128,16, 1,0
2393 KERNEL1x8_L2 128,16, 2,0
2394 KERNEL1x8_E2 128,16, 3,1
2399 /*----------------------------------------*/
2401 ble CGEMM_L1x8_SUB2_2
/* inline remainder: 2 kernel invocations */
2403 KERNEL1x8_L2 128,16, 0,0
2404 KERNEL1x8_E2 128,16, 1,1
2409 /*----------------------------------------*/
2411 ble CGEMM_L1x8_SUB2_1
/* inline remainder: 1 kernel invocation */
2413 KERNEL1x8_E2 128,16, 0,1
2418 /*----------------------------------------*/
2425 /*----------------------------------------*/
2429 #if defined(TRMMKERNEL)
2430 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
/* loop back for another 1x8 pass while the (not visible) preceding compare is greater-than */
2432 bgt CGEMM_L1x8_BEGIN
/* 1x4 driver. Visible flow: TRMM-only pointer/temp refresh, a `bl` into
   CGEMM_1x4_LMAIN_SUB, then `ble`-guarded remainder chunks and a TRMM-only
   REFRESH_AFTER_SAVE.
   NOTE(review): labels, the tests feeding each `ble`, and the #else/#endif lines are
   not visible in this excerpt. */
2442 /*----------------------------------------*/
2446 /*----------------------------------------*/
2451 #if defined(TRMMKERNEL)
2452 REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
2456 #if defined(TRMMKERNEL)
2457 REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
2460 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (T1-2); record form also sets CR0 */
2464 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (K-2); record form also sets CR0 */
2468 bl CGEMM_1x4_LMAIN_SUB
2475 /*----------------------------------------*/
2476 #if defined(TRMMKERNEL)
2494 /*----------------------------------------*/
2495 #if defined(TRMMKERNEL)
2513 /*----------------------------------------*/
/* remainder chunk handled by a subroutine call; skipped when the (not visible) test is <= 0 */
2515 ble CGEMM_L1x4_SUB2_8
2516 bl CGEMM_1x4_L16_SUB
2521 /*----------------------------------------*/
2523 ble CGEMM_L1x4_SUB2_4
2529 /*----------------------------------------*/
2531 ble CGEMM_L1x4_SUB2_2
/* inline remainder: 2 kernel invocations, the last one (_E2) passes 1 as its final argument */
2533 KERNEL1x4_L2 64,16, 0,0
2534 KERNEL1x4_E2 64,16, 1,1
2539 /*----------------------------------------*/
2541 ble CGEMM_L1x4_SUB2_1
/* inline remainder: 1 kernel invocation */
2543 KERNEL1x4_E2 64,16, 0,1
2548 /*----------------------------------------*/
2555 /*----------------------------------------*/
2557 #if defined(TRMMKERNEL)
2558 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
/* 1x2 driver. Visible flow: TRMM-only pointer/temp refresh, a `bl` into
   CGEMM_1x2_LMAIN_SUB, then `ble`-guarded remainder chunks and a TRMM-only
   REFRESH_AFTER_SAVE.
   NOTE(review): labels, the tests feeding each `ble`, and the #else/#endif lines are
   not visible in this excerpt. */
2563 /*----------------------------------------*/
2567 /*----------------------------------------*/
2570 #if defined(TRMMKERNEL)
2571 REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
2575 #if defined(TRMMKERNEL)
2576 REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
2579 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (T1-2); record form also sets CR0 */
2583 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (K-2); record form also sets CR0 */
2587 bl CGEMM_1x2_LMAIN_SUB
2594 /*----------------------------------------*/
2595 #if defined(TRMMKERNEL)
2613 /*----------------------------------------*/
2614 #if defined(TRMMKERNEL)
2632 /*----------------------------------------*/
/* remainder chunk handled by a subroutine call; skipped when the (not visible) test is <= 0 */
2634 ble CGEMM_L1x2_SUB2_8
2635 bl CGEMM_1x2_L16_SUB
2640 /*----------------------------------------*/
2642 ble CGEMM_L1x2_SUB2_4
2648 /*----------------------------------------*/
2650 ble CGEMM_L1x2_SUB2_2
/* inline remainder: 2 kernel invocations, the last one (_E2) passes 1 as its final argument */
2652 KERNEL1x2_L2 32,16, 0,0
2653 KERNEL1x2_E2 32,16, 1,1
2658 /*----------------------------------------*/
2660 ble CGEMM_L1x2_SUB2_1
/* inline remainder: 1 kernel invocation */
2662 KERNEL1x2_E2 32,16, 0,1
2667 /*----------------------------------------*/
2674 /*----------------------------------------*/
2676 #if defined(TRMMKERNEL)
2677 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
/* 1x1 driver. Visible flow: TRMM-only pointer/temp refresh, a `bl` into
   CGEMM_1x1_LMAIN_SUB, then `ble`-guarded remainder chunks and a TRMM-only
   REFRESH_AFTER_SAVE.
   NOTE(review): labels, the tests feeding each `ble`, and the #else/#endif lines are
   not visible in this excerpt. */
2682 /*----------------------------------------*/
2686 /*----------------------------------------*/
2689 #if defined(TRMMKERNEL)
2690 REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
2694 #if defined(TRMMKERNEL)
2695 REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
2698 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (T1-2); record form also sets CR0 */
2702 srawi. T8, T1, 5 /* T8 = T1 >> 5: number of 32x blocks in (K-2); record form also sets CR0 */
2706 bl CGEMM_1x1_LMAIN_SUB
2713 /*----------------------------------------*/
2714 #if defined(TRMMKERNEL)
2732 /*----------------------------------------*/
2733 #if defined(TRMMKERNEL)
2751 /*----------------------------------------*/
/* remainder chunk handled by a subroutine call; skipped when the (not visible) test is <= 0 */
2753 ble CGEMM_L1x1_SUB2_8
2754 bl CGEMM_1x1_L16_SUB
2759 /*----------------------------------------*/
2761 ble CGEMM_L1x1_SUB2_4
2767 /*----------------------------------------*/
2769 ble CGEMM_L1x1_SUB2_2
/* inline remainder: 2 kernel invocations, the last one (_E2) passes 1 as its final argument */
2771 KERNEL1x1_L2 16,16, 0,0
2772 KERNEL1x1_E2 16,16, 1,1
2777 /*----------------------------------------*/
2779 ble CGEMM_L1x1_SUB2_1
/* inline remainder: 1 kernel invocation */
2781 KERNEL1x1_E2 16,16, 0,1
2786 /*----------------------------------------*/
2793 /*----------------------------------------*/
2796 #if defined(TRMMKERNEL)
2797 REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
2802 /*----------------------------------------*/
2806 #if defined(TRMMKERNEL) && !defined(LEFT)
2807 addi TEMP_REG, TEMP_REG, 1