Added dgemm, dtrmm, zgemm and ztrmm kernels for POWER8
[platform/upstream/openblas.git] / kernel / power / ztrmm_logic_8x2_power8.S
// ---------------------------------------------------------------------------
// Entry of the section that walks N two at a time.
// J = N >> 1 is the trip count of the loop that is closed further down by
// `addic. J, J, -1` / `bgt ZTRMM_L2_BEGIN`; when J <= 0 the section is skipped
// through ZTRMM_L2_END.
// J, N, I, M, CO, C, AO, A, T1, LDC, KK and OFFSET are register aliases that
// are defined outside this fragment.
// ---------------------------------------------------------------------------
1         srawi.          J,      N,      1
2         ble             ZTRMM_L2_END
3
4 ZTRMM_L2_BEGIN:
5
// Per-trip setup: CO takes the current C pointer, AO restarts at A, and C is
// moved forward by LDC << 1 for the next trip (LDC is presumably already a
// byte stride -- TODO confirm against the code that sets it up).
6         mr              CO,     C
7         mr              AO,     A
8         slwi            T1,     LDC     ,       1
9         add             C,      C,      T1
10
// LEFT builds restart the running offset KK from OFFSET on every J trip.
11 #if defined(LEFT)
12         mr              KK,     OFFSET          // OFFSET -> KK
13 #endif
14
// I = M >> 3 trips of the 2x8 body; with I <= 0 go straight to the M tails.
15         srawi.          I,      M,      3
16         ble             ZTRMM_L2x8_END
17
// ---------------------------------------------------------------------------
// ZTRMM_L2x8: body of the I loop (I = M >> 3 trips) built on the 2x8 macro set.
// Each trip:
//   1. points BO at B and, in the #else variants, skips KK steps of B and A
//      (KK << 5 and KK << 7 bytes respectively);
//   2. derives the inner trip count K1 from K, KK and the LEFT/TRANSA build
//      flags, keeping a copy in KKK for the fix-up after the save;
//   3. runs the inner loop in groups of eight macro invocations, followed by
//      the K1 & 7 leftovers one at a time;
//   4. invokes SAVE2x8, advances AO/BO by (K - KKK) steps where required and,
//      for LEFT builds, adds 8 to KK.
// The LOAD*/KERNEL*/SAVE* macros and every register alias (including PRE) are
// defined outside this fragment; what a macro does per invocation cannot be
// seen from here.
// ---------------------------------------------------------------------------
18 ZTRMM_L2x8_BEGIN:
19
20
21 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
22         mr              BO,     B                                       // B -> BO
23 #else
24         mr              BO,     B                                       // B -> BO
25         slwi            T1,     KK,     5                               // Number of values in B shifted
26         slwi            T2,     KK,     7                               // Number of values in A shifted
27         add             BO,     BO,     T1                              // Add values to BO
28         add             AO,     AO,     T2                              // Add values to AO
29 #endif
30
31 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
32         sub             T1,     K,      KK                              // K - KK -> TEMP1
33 #else
34         mr              T1,     KK                                      // KK -> KTEMP
35 #ifdef LEFT
36         addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
37 #else
38         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
39 #endif
40 #endif
41
// Dispatch on L = K1 >> 3:
//   L <= 0 -> ZTRMM_L2x8_SUB0 (K1 < 8)
//   L == 1 -> ZTRMM_L2x8_SUB4 (one group of eight through the *_SUB* macros)
//   L >= 2 -> fall through into the LOOP_START / LOOP / LOOP_END chain
42         mr              KKK,    T1
43         mr              K1,     T1
44         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
45         ble             ZTRMM_L2x8_SUB0
46         cmpwi           cr0,    L,      1
47         ble             ZTRMM_L2x8_SUB4
48
// First group: LOAD2x8_1, KERNEL2x8_I1 and seven more KERNEL macros, each
// preceded by a dcbt cache-touch hint for address AO + PRE.
// L drops by 2 here because LOOP_END below always supplies one more group.
49 ZTRMM_L2x8_LOOP_START:
50
51         dcbt            AO,     PRE
52         LOAD2x8_1
53         dcbt            AO,     PRE
54         KERNEL2x8_I1
55         dcbt            AO,     PRE
56         KERNEL2x8_2
57         dcbt            AO,     PRE
58         KERNEL2x8_1
59         dcbt            AO,     PRE
60         KERNEL2x8_2
61
62         dcbt            AO,     PRE
63         KERNEL2x8_1
64         dcbt            AO,     PRE
65         KERNEL2x8_2
66         dcbt            AO,     PRE
67         KERNEL2x8_1
68         dcbt            AO,     PRE
69         KERNEL2x8_2
70
71         addic.          L,      L,      -2
72         ble             ZTRMM_L2x8_LOOP_END
73
74         .align 5
75
// Steady state: one group of eight KERNEL macros per remaining unit of L.
76 ZTRMM_L2x8_LOOP:
77
78         dcbt            AO,     PRE
79         KERNEL2x8_1
80         dcbt            AO,     PRE
81         KERNEL2x8_2
82         dcbt            AO,     PRE
83         KERNEL2x8_1
84         dcbt            AO,     PRE
85         KERNEL2x8_2
86
87         dcbt            AO,     PRE
88         KERNEL2x8_1
89         dcbt            AO,     PRE
90         KERNEL2x8_2
91         dcbt            AO,     PRE
92         KERNEL2x8_1
93         dcbt            AO,     PRE
94         KERNEL2x8_2
95
96         addic.          L,      L,      -1
97         bgt             ZTRMM_L2x8_LOOP
98
// Final group: closes with KERNEL2x8_E2 instead of KERNEL2x8_2 (presumably the
// variant that does not load ahead -- macro body not visible here).
99 ZTRMM_L2x8_LOOP_END:
100
101         dcbt            AO,     PRE
102         KERNEL2x8_1
103         dcbt            AO,     PRE
104         KERNEL2x8_2
105         dcbt            AO,     PRE
106         KERNEL2x8_1
107         dcbt            AO,     PRE
108         KERNEL2x8_2
109
110         dcbt            AO,     PRE
111         KERNEL2x8_1
112         dcbt            AO,     PRE
113         KERNEL2x8_2
114         dcbt            AO,     PRE
115         KERNEL2x8_1
116         KERNEL2x8_E2
117
118         b               ZTRMM_L2x8_SUB1
119
// L == 1: a single group of eight, issued as KERNEL2x8_SUBI1 followed by seven
// KERNEL2x8_SUB1; only the first four invocations carry a dcbt hint.
120 ZTRMM_L2x8_SUB4:
121
122         dcbt            AO,     PRE
123         KERNEL2x8_SUBI1
124         dcbt            AO,     PRE
125         KERNEL2x8_SUB1
126         dcbt            AO,     PRE
127         KERNEL2x8_SUB1
128         dcbt            AO,     PRE
129         KERNEL2x8_SUB1
130
131         KERNEL2x8_SUB1
132         KERNEL2x8_SUB1
133         KERNEL2x8_SUB1
134         KERNEL2x8_SUB1
135
136         b               ZTRMM_L2x8_SUB1
137
// K1 < 8: KERNEL2x8_SUBI1 is issued before L is tested, so one invocation
// always runs on this path; the remaining (K1 & 7) - 1 go through SUB2.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
138 ZTRMM_L2x8_SUB0:
139
140         andi.           L,      K1,     7                                               // K1 & 7 -> L
141
142         KERNEL2x8_SUBI1
143
144         addic.          L,      L,      -1
145         ble             ZTRMM_L2x8_SAVE
146         b               ZTRMM_L2x8_SUB2
147
// Leftovers after the groups of eight. The andi. result is never negative, so
// the ble below is taken only when K1 & 7 == 0.
148 ZTRMM_L2x8_SUB1:
149
150         andi.           L,      K1,     7                                               // K1 & 7 -> L
151         ble             ZTRMM_L2x8_SAVE
152
153 ZTRMM_L2x8_SUB2:
154
155         KERNEL2x8_SUB1
156
157         addic.          L,      L,      -1
158         bgt             ZTRMM_L2x8_SUB2
159
160 ZTRMM_L2x8_SAVE:
161
162         SAVE2x8
163
// Fix-up for the variants that started at the beginning of B: advance BO and
// AO by the (K - KKK) steps that were not consumed above.
164 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
165         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
166         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
167         slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
168         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
169         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
170 #endif
171
172 #if defined(LEFT)
173         addi            KK,     KK,     8                               // KK += Number of values in A
174 #endif
175
176
// Next trip of the I loop.
177         addic.          I,      I,      -1
178         bgt             ZTRMM_L2x8_BEGIN
179
180 ZTRMM_L2x8_END:
181
// ---------------------------------------------------------------------------
// ZTRMM_L2x4: M tail for bit 2 of M, built on the 2x4 macro set. No backward
// branch to this label exists, so it runs at most once per J trip.
// If M & 7 == 0 every tail is skipped (branch to ZTRMM_L2x1_END); if M & 4 is
// clear only this block is skipped.
// Same shape as the 2x8 body: position BO/AO (KK << 5 for B, KK << 6 for A in
// the #else variants), derive K1/KKK, run groups of eight macro invocations
// plus K1 & 7 leftovers, SAVE2x4, fix up AO/BO, and add 4 to KK for LEFT.
// Macros and register aliases are defined outside this fragment.
// ---------------------------------------------------------------------------
182 ZTRMM_L2x4_BEGIN:
183         andi.           T2,     M,      7
184         ble             ZTRMM_L2x1_END
185
186         andi.           T1,     M,      4
187         ble             ZTRMM_L2x4_END
188
189 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
190         mr              BO,     B                                       // B -> BO
191 #else
192         mr              BO,     B                                       // B -> BO
193         slwi            T1,     KK,     5                               // Number of values in B shifted
194         slwi            T2,     KK,     6                               // Number of values in A shifted
195         add             BO,     BO,     T1                              // Add values to BO
196         add             AO,     AO,     T2                              // Add values to AO
197 #endif
198
199 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
200         sub             T1,     K,      KK                              // K - KK -> TEMP1
201 #else
202         mr              T1,     KK                                      // KK -> KTEMP
203 #ifdef LEFT
204         addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
205 #else
206         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
207 #endif
208 #endif
209
// Dispatch on L = K1 >> 3: L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> loop chain.
210         mr              KKK,    T1
211         mr              K1,     T1
212         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
213         ble             ZTRMM_L2x4_SUB0
214         cmpwi           cr0,    L,      1
215         ble             ZTRMM_L2x4_SUB4
216
// First group of eight; L drops by 2 because LOOP_END always adds one group.
217 ZTRMM_L2x4_LOOP_START:
218
219         LOAD2x4_1
220         KERNEL2x4_I1
221         KERNEL2x4_2
222         KERNEL2x4_1
223         KERNEL2x4_2
224
225         KERNEL2x4_1
226         KERNEL2x4_2
227         KERNEL2x4_1
228         KERNEL2x4_2
229
230         addic.          L,      L,      -2
231         ble             ZTRMM_L2x4_LOOP_END
232
233         .align 5
234
// Steady state: one group of eight per remaining unit of L.
235 ZTRMM_L2x4_LOOP:
236
237         KERNEL2x4_1
238         KERNEL2x4_2
239         KERNEL2x4_1
240         KERNEL2x4_2
241
242         KERNEL2x4_1
243         KERNEL2x4_2
244         KERNEL2x4_1
245         KERNEL2x4_2
246
247         addic.          L,      L,      -1
248         bgt             ZTRMM_L2x4_LOOP
249
// Final group, closed by KERNEL2x4_E2 rather than KERNEL2x4_2.
250 ZTRMM_L2x4_LOOP_END:
251
252         KERNEL2x4_1
253         KERNEL2x4_2
254         KERNEL2x4_1
255         KERNEL2x4_2
256
257         KERNEL2x4_1
258         KERNEL2x4_2
259         KERNEL2x4_1
260         KERNEL2x4_E2
261
262         b               ZTRMM_L2x4_SUB1
263
// L == 1: one group of eight through the SUBI1/SUB1 macros.
264 ZTRMM_L2x4_SUB4:
265
266         KERNEL2x4_SUBI1
267         KERNEL2x4_SUB1
268         KERNEL2x4_SUB1
269         KERNEL2x4_SUB1
270
271         KERNEL2x4_SUB1
272         KERNEL2x4_SUB1
273         KERNEL2x4_SUB1
274         KERNEL2x4_SUB1
275
276         b               ZTRMM_L2x4_SUB1
277
// K1 < 8: SUBI1 is issued before L is tested, so one invocation always runs.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
278 ZTRMM_L2x4_SUB0:
279
280         andi.           L,      K1,     7                                               // K1 & 7 -> L
281
282         KERNEL2x4_SUBI1
283
284         addic.          L,      L,      -1
285         ble             ZTRMM_L2x4_SAVE
286         b               ZTRMM_L2x4_SUB2
287
// Leftovers after the groups of eight; branch taken only when K1 & 7 == 0.
288 ZTRMM_L2x4_SUB1:
289
290         andi.           L,      K1,     7                                               // K1 & 7 -> L
291         ble             ZTRMM_L2x4_SAVE
292
293 ZTRMM_L2x4_SUB2:
294
295         KERNEL2x4_SUB1
296
297         addic.          L,      L,      -1
298         bgt             ZTRMM_L2x4_SUB2
299
300 ZTRMM_L2x4_SAVE:
301
302         SAVE2x4
303
// Advance BO/AO by the (K - KKK) steps that were not consumed above.
304 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
305         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
306         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
307         slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
308         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
309         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
310 #endif
311
312 #if defined(LEFT)
313         addi            KK,     KK,     4                               // KK += Number of values in A
314 #endif
315
316
317 ZTRMM_L2x4_END:
318
// ---------------------------------------------------------------------------
// ZTRMM_L2x2: M tail for bit 1 of M, built on the 2x2 macro set; skipped when
// M & 2 is clear. No backward branch to this label exists, so it runs at most
// once per J trip.
// Same shape as the wider tiles: position BO/AO (KK << 5 for both B and A in
// the #else variants), derive K1/KKK, run groups of eight macro invocations
// plus K1 & 7 leftovers, SAVE2x2, fix up AO/BO, and add 2 to KK for LEFT.
// Macros and register aliases are defined outside this fragment.
// ---------------------------------------------------------------------------
319 ZTRMM_L2x2_BEGIN:
320
321         andi.           T1,     M,      2
322         ble             ZTRMM_L2x2_END
323
324 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
325         mr              BO,     B                                       // B -> BO
326 #else
327         mr              BO,     B                                       // B -> BO
328         slwi            T1,     KK,     5                               // Number of values in B shifted
329         slwi            T2,     KK,     5                               // Number of values in A shifted
330         add             BO,     BO,     T1                              // Add values to BO
331         add             AO,     AO,     T2                              // Add values to AO
332 #endif
333
334 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
335         sub             T1,     K,      KK                              // K - KK -> TEMP1
336 #else
337         mr              T1,     KK                                      // KK -> KTEMP
338 #ifdef LEFT
339         addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
340 #else
341         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
342 #endif
343 #endif
344
// Dispatch on L = K1 >> 3: L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> loop chain.
345         mr              KKK,    T1
346         mr              K1,     T1
347         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
348         ble             ZTRMM_L2x2_SUB0
349         cmpwi           cr0,    L,      1
350         ble             ZTRMM_L2x2_SUB4
351
// First group of eight; L drops by 2 because LOOP_END always adds one group.
352 ZTRMM_L2x2_LOOP_START:
353
354         LOAD2x2_1
355         KERNEL2x2_I1
356         KERNEL2x2_2
357         KERNEL2x2_1
358         KERNEL2x2_2
359
360         KERNEL2x2_1
361         KERNEL2x2_2
362         KERNEL2x2_1
363         KERNEL2x2_2
364
365         addic.          L,      L,      -2
366         ble             ZTRMM_L2x2_LOOP_END
367
368         .align 5
369
// Steady state: one group of eight per remaining unit of L.
370 ZTRMM_L2x2_LOOP:
371
372         KERNEL2x2_1
373         KERNEL2x2_2
374         KERNEL2x2_1
375         KERNEL2x2_2
376
377         KERNEL2x2_1
378         KERNEL2x2_2
379         KERNEL2x2_1
380         KERNEL2x2_2
381
382         addic.          L,      L,      -1
383         bgt             ZTRMM_L2x2_LOOP
384
// Final group, closed by KERNEL2x2_E2 rather than KERNEL2x2_2.
385 ZTRMM_L2x2_LOOP_END:
386
387         KERNEL2x2_1
388         KERNEL2x2_2
389         KERNEL2x2_1
390         KERNEL2x2_2
391
392         KERNEL2x2_1
393         KERNEL2x2_2
394         KERNEL2x2_1
395         KERNEL2x2_E2
396
397         b               ZTRMM_L2x2_SUB1
398
// L == 1: one group of eight through the SUBI1/SUB1 macros.
399 ZTRMM_L2x2_SUB4:
400
401         KERNEL2x2_SUBI1
402         KERNEL2x2_SUB1
403         KERNEL2x2_SUB1
404         KERNEL2x2_SUB1
405
406         KERNEL2x2_SUB1
407         KERNEL2x2_SUB1
408         KERNEL2x2_SUB1
409         KERNEL2x2_SUB1
410
411         b               ZTRMM_L2x2_SUB1
412
// K1 < 8: SUBI1 is issued before L is tested, so one invocation always runs.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
413 ZTRMM_L2x2_SUB0:
414
415         andi.           L,      K1,     7                                               // K1 & 7 -> L
416
417         KERNEL2x2_SUBI1
418
419         addic.          L,      L,      -1
420         ble             ZTRMM_L2x2_SAVE
421         b               ZTRMM_L2x2_SUB2
422
// Leftovers after the groups of eight; branch taken only when K1 & 7 == 0.
423 ZTRMM_L2x2_SUB1:
424
425         andi.           L,      K1,     7                                               // K1 & 7 -> L
426         ble             ZTRMM_L2x2_SAVE
427
428 ZTRMM_L2x2_SUB2:
429
430         KERNEL2x2_SUB1
431
432         addic.          L,      L,      -1
433         bgt             ZTRMM_L2x2_SUB2
434
435 ZTRMM_L2x2_SAVE:
436
437         SAVE2x2
438
// Advance BO/AO by the (K - KKK) steps that were not consumed above.
439 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
440         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
441         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
442         slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
443         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
444         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
445 #endif
446
447 #if defined(LEFT)
448         addi            KK,     KK,     2                               // KK += Number of values in A
449 #endif
450
451
452 ZTRMM_L2x2_END:
453
// ---------------------------------------------------------------------------
// ZTRMM_L2x1: M tail for bit 0 of M, built on the 2x1 macro set; skipped when
// M & 1 is clear. No backward branch to this label exists, so it runs at most
// once per J trip.
// Same shape as the wider tiles: position BO/AO (KK << 5 for B, KK << 4 for A
// in the #else variants), derive K1/KKK, run groups of eight macro invocations
// plus K1 & 7 leftovers, SAVE2x1, fix up AO/BO, and add 1 to KK for LEFT.
// Macros and register aliases are defined outside this fragment.
// ---------------------------------------------------------------------------
454 ZTRMM_L2x1_BEGIN:
455
456         andi.           T1,     M,      1
457         ble             ZTRMM_L2x1_END
458
459 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
460         mr              BO,     B                                       // B -> BO
461 #else
462         mr              BO,     B                                       // B -> BO
463         slwi            T1,     KK,     5                               // Number of values in B shifted
464         slwi            T2,     KK,     4                               // Number of values in A shifted
465         add             BO,     BO,     T1                              // Add values to BO
466         add             AO,     AO,     T2                              // Add values to AO
467 #endif
468
469 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
470         sub             T1,     K,      KK                              // K - KK -> TEMP1
471 #else
472         mr              T1,     KK                                      // KK -> KTEMP
473 #ifdef LEFT
474         addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
475 #else
476         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
477 #endif
478 #endif
479
// Dispatch on L = K1 >> 3: L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> loop chain.
480         mr              KKK,    T1
481         mr              K1,     T1
482         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
483         ble             ZTRMM_L2x1_SUB0
484         cmpwi           cr0,    L,      1
485         ble             ZTRMM_L2x1_SUB4
486
// First group of eight; L drops by 2 because LOOP_END always adds one group.
487 ZTRMM_L2x1_LOOP_START:
488
489         LOAD2x1_1
490         KERNEL2x1_I1
491         KERNEL2x1_2
492         KERNEL2x1_1
493         KERNEL2x1_2
494
495         KERNEL2x1_1
496         KERNEL2x1_2
497         KERNEL2x1_1
498         KERNEL2x1_2
499
500         addic.          L,      L,      -2
501         ble             ZTRMM_L2x1_LOOP_END
502
503         .align 5
504
// Steady state: one group of eight per remaining unit of L.
505 ZTRMM_L2x1_LOOP:
506
507         KERNEL2x1_1
508         KERNEL2x1_2
509         KERNEL2x1_1
510         KERNEL2x1_2
511
512         KERNEL2x1_1
513         KERNEL2x1_2
514         KERNEL2x1_1
515         KERNEL2x1_2
516
517         addic.          L,      L,      -1
518         bgt             ZTRMM_L2x1_LOOP
519
// Final group, closed by KERNEL2x1_E2 rather than KERNEL2x1_2.
520 ZTRMM_L2x1_LOOP_END:
521
522         KERNEL2x1_1
523         KERNEL2x1_2
524         KERNEL2x1_1
525         KERNEL2x1_2
526
527         KERNEL2x1_1
528         KERNEL2x1_2
529         KERNEL2x1_1
530         KERNEL2x1_E2
531
532         b               ZTRMM_L2x1_SUB1
533
// L == 1: one group of eight through the SUBI1/SUB1 macros.
534 ZTRMM_L2x1_SUB4:
535
536         KERNEL2x1_SUBI1
537         KERNEL2x1_SUB1
538         KERNEL2x1_SUB1
539         KERNEL2x1_SUB1
540
541         KERNEL2x1_SUB1
542         KERNEL2x1_SUB1
543         KERNEL2x1_SUB1
544         KERNEL2x1_SUB1
545
546         b               ZTRMM_L2x1_SUB1
547
// K1 < 8: SUBI1 is issued before L is tested, so one invocation always runs.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
548 ZTRMM_L2x1_SUB0:
549
550         andi.           L,      K1,     7                                               // K1 & 7 -> L
551
552         KERNEL2x1_SUBI1
553
554         addic.          L,      L,      -1
555         ble             ZTRMM_L2x1_SAVE
556         b               ZTRMM_L2x1_SUB2
557
// Leftovers after the groups of eight; branch taken only when K1 & 7 == 0.
558 ZTRMM_L2x1_SUB1:
559
560         andi.           L,      K1,     7                                               // K1 & 7 -> L
561         ble             ZTRMM_L2x1_SAVE
562
563 ZTRMM_L2x1_SUB2:
564
565         KERNEL2x1_SUB1
566
567         addic.          L,      L,      -1
568         bgt             ZTRMM_L2x1_SUB2
569
570 ZTRMM_L2x1_SAVE:
571
572         SAVE2x1
573
// Advance BO/AO by the (K - KKK) steps that were not consumed above.
574 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
575         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
576         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
577         slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
578         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
579         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
580 #endif
581
582 #if defined(LEFT)
583         addi            KK,     KK,     1                               // KK += Number of values in A
584 #endif
585
586
587 ZTRMM_L2x1_END:
588
// ---------------------------------------------------------------------------
// Bottom of the J loop: step B forward by K << 5 bytes, add 2 to KK for
// non-LEFT builds, then decrement J and repeat from ZTRMM_L2_BEGIN while J > 0.
// ---------------------------------------------------------------------------
589         slwi            T1,     K,      5
590         add             B,      B,      T1
591
592 #if !defined(LEFT)
593         addi            KK,     KK,     2                                       // KK += Number of values in B
594 #endif
595
596
597         addic.          J,      J,      -1
598         bgt             ZTRMM_L2_BEGIN
599
// After the last trip: when bit 0 of N is clear there is nothing left, so jump
// straight to L999 (defined outside this fragment). Otherwise fall through to
// the hand-off below, where ZTRMM_L1_BEGIN repeats the same N & 1 test.
600         andi.           T2,     N,      1
601         ble             L999
602
603 ZTRMM_L2_END:
604
605         b               ZTRMM_L1_BEGIN
606
// Trampoline to L999; no branch to L999_H1 appears in the visible part of the
// file.
607 L999_H1:
608
609         b               L999
610
// ---------------------------------------------------------------------------
// Entry of the section that handles bit 0 of N: skipped via ZTRMM_L1_END when
// N & 1 == 0. Setup mirrors the two-wide section, except that C is not
// advanced here: CO takes C, AO restarts at A, and LEFT builds reload KK from
// OFFSET.
// ---------------------------------------------------------------------------
611 ZTRMM_L1_BEGIN:
612
613         andi.           T1,     N,      1
614         ble             ZTRMM_L1_END
615         mr              CO,     C
616         mr              AO,     A
617
618 #if defined(LEFT)
619         mr              KK,     OFFSET          // OFFSET -> KK
620 #endif
621
// I = M >> 3 trips of the 1x8 body; with I <= 0 go straight to the M tails.
622         srawi.          I,      M,      3
623         ble             ZTRMM_L1x8_END
624
// ---------------------------------------------------------------------------
// ZTRMM_L1x8: body of the I loop (I = M >> 3 trips) built on the 1x8 macro set.
// Each trip:
//   1. points BO at B and, in the #else variants, skips KK steps of B and A
//      (KK << 4 and KK << 7 bytes respectively);
//   2. derives the inner trip count K1 from K, KK and the LEFT/TRANSA build
//      flags, keeping a copy in KKK for the fix-up after the save;
//   3. runs the inner loop in groups of eight macro invocations, followed by
//      the K1 & 7 leftovers one at a time;
//   4. invokes SAVE1x8, advances AO/BO by (K - KKK) steps where required and,
//      for LEFT builds, adds 8 to KK.
// The LOAD*/KERNEL*/SAVE* macros and every register alias (including PRE) are
// defined outside this fragment.
// ---------------------------------------------------------------------------
625 ZTRMM_L1x8_BEGIN:
626
627
628 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
629         mr              BO,     B                                       // B -> BO
630 #else
631         mr              BO,     B                                       // B -> BO
632         slwi            T1,     KK,     4                               // Number of values in B shifted
633         slwi            T2,     KK,     7                               // Number of values in A shifted
634         add             BO,     BO,     T1                              // Add values to BO
635         add             AO,     AO,     T2                              // Add values to AO
636 #endif
637
638 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
639         sub             T1,     K,      KK                              // K - KK -> TEMP1
640 #else
641         mr              T1,     KK                                      // KK -> KTEMP
642 #ifdef LEFT
643         addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
644 #else
645         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
646 #endif
647 #endif
648
// Dispatch on L = K1 >> 3:
//   L <= 0 -> ZTRMM_L1x8_SUB0 (K1 < 8)
//   L == 1 -> ZTRMM_L1x8_SUB4 (one group of eight through the *_SUB* macros)
//   L >= 2 -> fall through into the LOOP_START / LOOP / LOOP_END chain
649         mr              KKK,    T1
650         mr              K1,     T1
651         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
652         ble             ZTRMM_L1x8_SUB0
653         cmpwi           cr0,    L,      1
654         ble             ZTRMM_L1x8_SUB4
655
// First group: LOAD1x8_1, KERNEL1x8_I1 and seven more KERNEL macros, each
// preceded by a dcbt cache-touch hint for address AO + PRE.
// L drops by 2 here because LOOP_END below always supplies one more group.
656 ZTRMM_L1x8_LOOP_START:
657
658         dcbt            AO,     PRE
659         LOAD1x8_1
660         dcbt            AO,     PRE
661         KERNEL1x8_I1
662         dcbt            AO,     PRE
663         KERNEL1x8_2
664         dcbt            AO,     PRE
665         KERNEL1x8_1
666         dcbt            AO,     PRE
667         KERNEL1x8_2
668
669         dcbt            AO,     PRE
670         KERNEL1x8_1
671         dcbt            AO,     PRE
672         KERNEL1x8_2
673         dcbt            AO,     PRE
674         KERNEL1x8_1
675         dcbt            AO,     PRE
676         KERNEL1x8_2
677
678         addic.          L,      L,      -2
679         ble             ZTRMM_L1x8_LOOP_END
680
681         .align 5
682
// Steady state: one group of eight KERNEL macros per remaining unit of L.
683 ZTRMM_L1x8_LOOP:
684
685         dcbt            AO,     PRE
686         KERNEL1x8_1
687         dcbt            AO,     PRE
688         KERNEL1x8_2
689         dcbt            AO,     PRE
690         KERNEL1x8_1
691         dcbt            AO,     PRE
692         KERNEL1x8_2
693
694         dcbt            AO,     PRE
695         KERNEL1x8_1
696         dcbt            AO,     PRE
697         KERNEL1x8_2
698         dcbt            AO,     PRE
699         KERNEL1x8_1
700         dcbt            AO,     PRE
701         KERNEL1x8_2
702
703         addic.          L,      L,      -1
704         bgt             ZTRMM_L1x8_LOOP
705
// Final group: closes with KERNEL1x8_E2 instead of KERNEL1x8_2 (presumably the
// variant that does not load ahead -- macro body not visible here).
706 ZTRMM_L1x8_LOOP_END:
707
708         dcbt            AO,     PRE
709         KERNEL1x8_1
710         dcbt            AO,     PRE
711         KERNEL1x8_2
712         dcbt            AO,     PRE
713         KERNEL1x8_1
714         dcbt            AO,     PRE
715         KERNEL1x8_2
716
717         dcbt            AO,     PRE
718         KERNEL1x8_1
719         dcbt            AO,     PRE
720         KERNEL1x8_2
721         dcbt            AO,     PRE
722         KERNEL1x8_1
723         KERNEL1x8_E2
724
725         b               ZTRMM_L1x8_SUB1
726
// L == 1: a single group of eight, issued as KERNEL1x8_SUBI1 followed by seven
// KERNEL1x8_SUB1; only the first four invocations carry a dcbt hint.
727 ZTRMM_L1x8_SUB4:
728
729         dcbt            AO,     PRE
730         KERNEL1x8_SUBI1
731         dcbt            AO,     PRE
732         KERNEL1x8_SUB1
733         dcbt            AO,     PRE
734         KERNEL1x8_SUB1
735         dcbt            AO,     PRE
736         KERNEL1x8_SUB1
737
738         KERNEL1x8_SUB1
739         KERNEL1x8_SUB1
740         KERNEL1x8_SUB1
741         KERNEL1x8_SUB1
742
743         b               ZTRMM_L1x8_SUB1
744
// K1 < 8: KERNEL1x8_SUBI1 is issued before L is tested, so one invocation
// always runs on this path; the remaining (K1 & 7) - 1 go through SUB2.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
745 ZTRMM_L1x8_SUB0:
746
747         andi.           L,      K1,     7                                               // K1 & 7 -> L
748
749         KERNEL1x8_SUBI1
750
751         addic.          L,      L,      -1
752         ble             ZTRMM_L1x8_SAVE
753         b               ZTRMM_L1x8_SUB2
754
// Leftovers after the groups of eight. The andi. result is never negative, so
// the ble below is taken only when K1 & 7 == 0.
755 ZTRMM_L1x8_SUB1:
756
757         andi.           L,      K1,     7                                               // K1 & 7 -> L
758         ble             ZTRMM_L1x8_SAVE
759
760 ZTRMM_L1x8_SUB2:
761
762         KERNEL1x8_SUB1
763
764         addic.          L,      L,      -1
765         bgt             ZTRMM_L1x8_SUB2
766
767 ZTRMM_L1x8_SAVE:
768
769         SAVE1x8
770
// Fix-up for the variants that started at the beginning of B: advance BO and
// AO by the (K - KKK) steps that were not consumed above.
771 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
772         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
773         slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
774         slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
775         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
776         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
777 #endif
778
779 #if defined(LEFT)
780         addi            KK,     KK,     8                               // KK += Number of values in A
781 #endif
782
783
// Next trip of the I loop.
784         addic.          I,      I,      -1
785         bgt             ZTRMM_L1x8_BEGIN
786
787 ZTRMM_L1x8_END:
788
// ---------------------------------------------------------------------------
// ZTRMM_L1x4: M tail for bit 2 of M, built on the 1x4 macro set. No backward
// branch to this label is visible in this part of the file.
// If M & 7 == 0 every tail is skipped (branch to ZTRMM_L1x1_END); if M & 4 is
// clear only this block is skipped.
// Same shape as the 1x8 body: position BO/AO (KK << 4 for B, KK << 6 for A in
// the #else variants), derive K1/KKK, run groups of eight macro invocations
// plus K1 & 7 leftovers, SAVE1x4, fix up AO/BO, and add 4 to KK for LEFT.
// Macros and register aliases are defined outside this fragment.
// ---------------------------------------------------------------------------
789 ZTRMM_L1x4_BEGIN:
790         andi.           T2,     M,      7
791         ble             ZTRMM_L1x1_END
792
793         andi.           T1,     M,      4
794         ble             ZTRMM_L1x4_END
795
796 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
797         mr              BO,     B                                       // B -> BO
798 #else
799         mr              BO,     B                                       // B -> BO
800         slwi            T1,     KK,     4                               // Number of values in B shifted
801         slwi            T2,     KK,     6                               // Number of values in A shifted
802         add             BO,     BO,     T1                              // Add values to BO
803         add             AO,     AO,     T2                              // Add values to AO
804 #endif
805
806 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
807         sub             T1,     K,      KK                              // K - KK -> TEMP1
808 #else
809         mr              T1,     KK                                      // KK -> KTEMP
810 #ifdef LEFT
811         addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
812 #else
813         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
814 #endif
815 #endif
816
// Dispatch on L = K1 >> 3: L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> loop chain.
817         mr              KKK,    T1
818         mr              K1,     T1
819         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
820         ble             ZTRMM_L1x4_SUB0
821         cmpwi           cr0,    L,      1
822         ble             ZTRMM_L1x4_SUB4
823
// First group of eight; L drops by 2 because LOOP_END always adds one group.
824 ZTRMM_L1x4_LOOP_START:
825
826         LOAD1x4_1
827         KERNEL1x4_I1
828         KERNEL1x4_2
829         KERNEL1x4_1
830         KERNEL1x4_2
831
832         KERNEL1x4_1
833         KERNEL1x4_2
834         KERNEL1x4_1
835         KERNEL1x4_2
836
837         addic.          L,      L,      -2
838         ble             ZTRMM_L1x4_LOOP_END
839
840         .align 5
841
// Steady state: one group of eight per remaining unit of L.
842 ZTRMM_L1x4_LOOP:
843
844         KERNEL1x4_1
845         KERNEL1x4_2
846         KERNEL1x4_1
847         KERNEL1x4_2
848
849         KERNEL1x4_1
850         KERNEL1x4_2
851         KERNEL1x4_1
852         KERNEL1x4_2
853
854         addic.          L,      L,      -1
855         bgt             ZTRMM_L1x4_LOOP
856
// Final group, closed by KERNEL1x4_E2 rather than KERNEL1x4_2.
857 ZTRMM_L1x4_LOOP_END:
858
859         KERNEL1x4_1
860         KERNEL1x4_2
861         KERNEL1x4_1
862         KERNEL1x4_2
863
864         KERNEL1x4_1
865         KERNEL1x4_2
866         KERNEL1x4_1
867         KERNEL1x4_E2
868
869         b               ZTRMM_L1x4_SUB1
870
// L == 1: one group of eight through the SUBI1/SUB1 macros.
871 ZTRMM_L1x4_SUB4:
872
873         KERNEL1x4_SUBI1
874         KERNEL1x4_SUB1
875         KERNEL1x4_SUB1
876         KERNEL1x4_SUB1
877
878         KERNEL1x4_SUB1
879         KERNEL1x4_SUB1
880         KERNEL1x4_SUB1
881         KERNEL1x4_SUB1
882
883         b               ZTRMM_L1x4_SUB1
884
// K1 < 8: SUBI1 is issued before L is tested, so one invocation always runs.
// NOTE(review): presumably K1 >= 1 is guaranteed on entry -- confirm.
885 ZTRMM_L1x4_SUB0:
886
887         andi.           L,      K1,     7                                               // K1 & 7 -> L
888
889         KERNEL1x4_SUBI1
890
891         addic.          L,      L,      -1
892         ble             ZTRMM_L1x4_SAVE
893         b               ZTRMM_L1x4_SUB2
894
// Leftovers after the groups of eight; branch taken only when K1 & 7 == 0.
895 ZTRMM_L1x4_SUB1:
896
897         andi.           L,      K1,     7                                               // K1 & 7 -> L
898         ble             ZTRMM_L1x4_SAVE
899
900 ZTRMM_L1x4_SUB2:
901
902         KERNEL1x4_SUB1
903
904         addic.          L,      L,      -1
905         bgt             ZTRMM_L1x4_SUB2
906
907 ZTRMM_L1x4_SAVE:
908
909         SAVE1x4
910
// Advance BO/AO by the (K - KKK) steps that were not consumed above.
911 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
912         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
913         slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
914         slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
915         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
916         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
917 #endif
918
919 #if defined(LEFT)
920         addi            KK,     KK,     4                               // KK += Number of values in A
921 #endif
922
923
924 ZTRMM_L1x4_END:
925
926 ZTRMM_L1x2_BEGIN:                                           // block run only when bit 1 of M is set (presumably the 2-row remainder tile — confirm against the macro file)
927
928         andi.           T1,     M,      2                               // T1 = M & 2, cr0 updated
929         ble             ZTRMM_L1x2_END                                  // result is 0 or 2, so this skips the block when (M & 2) == 0
930
931 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
932         mr              BO,     B                                       // BO = B; AO is used as it stands
933 #else
934         mr              BO,     B                                       // BO = B
935         slwi            T1,     KK,     4                               // T1 = KK << 4 : byte offset into B (16 bytes per KK step)
936         slwi            T2,     KK,     5                               // T2 = KK << 5 : byte offset into A (32 bytes per KK step). NOTE(review): slwi is a 32-bit shift — confirm KK << 5 always fits
937         add             BO,     BO,     T1                              // BO = B + T1
938         add             AO,     AO,     T2                              // AO += T2
939 #endif
940
941 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
942         sub             T1,     K,      KK                              // T1 = K - KK
943 #else
944         mr              T1,     KK                                      // T1 = KK
945 #ifdef LEFT
946         addi            T1,     T1,     2                               // T1 = KK + 2 (number of A values per step in this block)
947 #else
948         addi            T1,     T1,     1                               // T1 = KK + 1 (number of B values per step in this block)
949 #endif
950 #endif
951
952         mr              KKK,    T1                                      // KKK = step count for this block; read again after SAVE1x2
953         mr              K1,     T1                                      // K1 = same count, drives the loop split below
954         srawi.          L,      K1,     3                               // L = K1 >> 3 (groups of 8 steps), cr0 updated
955         ble             ZTRMM_L1x2_SUB0                                 // L <= 0: fewer than 8 steps, remainder path only
956         cmpwi           cr0,    L,      1                               // compare L with 1
957         ble             ZTRMM_L1x2_SUB4                                 // L == 1: single group of 8 done with the SUB* macros
958
959 ZTRMM_L1x2_LOOP_START:                                      // reached only with L >= 2
960
961         LOAD1x2_1                                                       // macro defined elsewhere; presumably the initial operand load — confirm
962         KERNEL1x2_I1                                                    // first of 8 kernel macro invocations in this group
963         KERNEL1x2_2
964         KERNEL1x2_1
965         KERNEL1x2_2
966
967         KERNEL1x2_1
968         KERNEL1x2_2
969         KERNEL1x2_1
970         KERNEL1x2_2
971
972         addic.          L,      L,      -2                              // L -= 2: this group and the LOOP_END group are outside the loop
973         ble             ZTRMM_L1x2_LOOP_END                             // exactly two groups: no steady-state iterations
974
975         .align 5                                                        // alignment for the loop head that follows
976
977 ZTRMM_L1x2_LOOP:                                            // steady state: 8 kernel macro invocations per iteration
978
979         KERNEL1x2_1
980         KERNEL1x2_2
981         KERNEL1x2_1
982         KERNEL1x2_2
983
984         KERNEL1x2_1
985         KERNEL1x2_2
986         KERNEL1x2_1
987         KERNEL1x2_2
988
989         addic.          L,      L,      -1                              // one group done, cr0 updated
990         bgt             ZTRMM_L1x2_LOOP                                 // repeat while L > 0
991
992 ZTRMM_L1x2_LOOP_END:                                        // final group of 8
993
994         KERNEL1x2_1
995         KERNEL1x2_2
996         KERNEL1x2_1
997         KERNEL1x2_2
998
999         KERNEL1x2_1
1000         KERNEL1x2_2
1001         KERNEL1x2_1
1002         KERNEL1x2_E2                                                    // _E2 variant closes the sequence (presumably issues no further loads — confirm in the macro file)
1003
1004         b               ZTRMM_L1x2_SUB1                                 // continue with the K1 & 7 remainder
1005
1006 ZTRMM_L1x2_SUB4:                                            // L == 1: eight steps, SUBI1 first and then seven SUB1
1007
1008         KERNEL1x2_SUBI1
1009         KERNEL1x2_SUB1
1010         KERNEL1x2_SUB1
1011         KERNEL1x2_SUB1
1012
1013         KERNEL1x2_SUB1
1014         KERNEL1x2_SUB1
1015         KERNEL1x2_SUB1
1016         KERNEL1x2_SUB1
1017
1018         b               ZTRMM_L1x2_SUB1                                 // continue with the K1 & 7 remainder
1019
1020 ZTRMM_L1x2_SUB0:                                            // K1 < 8: no full group was run
1021
1022         andi.           L,      K1,     7                                               // L = K1 & 7
1023
1024         KERNEL1x2_SUBI1                                                 // first step, issued before L is tested. NOTE(review): also runs if K1 were 0 — confirm K1 >= 1 is guaranteed on this path
1025
1026         addic.          L,      L,      -1                              // account for the step just done, cr0 updated
1027         ble             ZTRMM_L1x2_SAVE                                 // that was the only step
1028         b               ZTRMM_L1x2_SUB2                                 // L more single steps to go
1029
1030 ZTRMM_L1x2_SUB1:                                            // remainder after at least one full group
1031
1032         andi.           L,      K1,     7                                               // L = K1 & 7, cr0 updated
1033         ble             ZTRMM_L1x2_SAVE                                 // no remainder
1034
1035 ZTRMM_L1x2_SUB2:                                            // single-step loop, runs L times
1036
1037         KERNEL1x2_SUB1
1038
1039         addic.          L,      L,      -1                              // L -= 1, cr0 updated
1040         bgt             ZTRMM_L1x2_SUB2                                 // repeat while L > 0
1041
1042 ZTRMM_L1x2_SAVE:
1043
1044         SAVE1x2                                                         // macro defined elsewhere; presumably stores the accumulated 1x2 result to C — confirm
1045
1046 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1047         sub             T1,     K,      KKK                                     // T1 = K - KKK (steps not run by the loop above)
1048         slwi            T2,     T1,     4                       // T2 = T1 << 4 : byte count for B (16 bytes per step)
1049         slwi            T1,     T1,     5                       // T1 = T1 << 5 : byte count for A (32 bytes per step)
1050         add             BO,     BO,     T2                                      // BO += T2 (T2 is already a byte offset)
1051         add             AO,     AO,     T1                                      // AO += T1 (T1 is already a byte offset)
1052 #endif
1053
1054 #if defined(LEFT)
1055         addi            KK,     KK,     2                               // KK += 2 (number of A values handled per step by this block)
1056 #endif
1057
1058
1059 ZTRMM_L1x2_END:
1060
1061 ZTRMM_L1x1_BEGIN:                                           // block run only when bit 0 of M is set (presumably the final single-row tile — confirm against the macro file)
1062
1063         andi.           T1,     M,      1                               // T1 = M & 1, cr0 updated
1064         ble             ZTRMM_L1x1_END                                  // result is 0 or 1, so this skips the block when M is even
1065
1066 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1067         mr              BO,     B                                       // BO = B; AO is used as it stands
1068 #else
1069         mr              BO,     B                                       // BO = B
1070         slwi            T1,     KK,     4                               // T1 = KK << 4 : byte offset into B (16 bytes per KK step)
1071         slwi            T2,     KK,     4                               // T2 = KK << 4 : byte offset into A (16 bytes per KK step). NOTE(review): slwi is a 32-bit shift — confirm KK << 4 always fits
1072         add             BO,     BO,     T1                              // BO = B + T1
1073         add             AO,     AO,     T2                              // AO += T2
1074 #endif
1075
1076 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1077         sub             T1,     K,      KK                              // T1 = K - KK
1078 #else
1079         mr              T1,     KK                                      // T1 = KK
1080 #ifdef LEFT
1081         addi            T1,     T1,     1                               // T1 = KK + 1 (number of A values per step in this block)
1082 #else
1083         addi            T1,     T1,     1                               // T1 = KK + 1 (number of B values per step in this block)
1084 #endif
1085 #endif
1086
1087         mr              KKK,    T1                                      // KKK = step count for this block; read again after SAVE1x1
1088         mr              K1,     T1                                      // K1 = same count, drives the loop split below
1089         srawi.          L,      K1,     3                               // L = K1 >> 3 (groups of 8 steps), cr0 updated
1090         ble             ZTRMM_L1x1_SUB0                                 // L <= 0: fewer than 8 steps, remainder path only
1091         cmpwi           cr0,    L,      1                               // compare L with 1
1092         ble             ZTRMM_L1x1_SUB4                                 // L == 1: single group of 8 done with the SUB* macros
1093
1094 ZTRMM_L1x1_LOOP_START:                                      // reached only with L >= 2
1095
1096         LOAD1x1_1                                                       // macro defined elsewhere; presumably the initial operand load — confirm
1097         KERNEL1x1_I1                                                    // first of 8 kernel macro invocations in this group
1098         KERNEL1x1_2
1099         KERNEL1x1_1
1100         KERNEL1x1_2
1101
1102         KERNEL1x1_1
1103         KERNEL1x1_2
1104         KERNEL1x1_1
1105         KERNEL1x1_2
1106
1107         addic.          L,      L,      -2                              // L -= 2: this group and the LOOP_END group are outside the loop
1108         ble             ZTRMM_L1x1_LOOP_END                             // exactly two groups: no steady-state iterations
1109
1110         .align 5                                                        // alignment for the loop head that follows
1111
1112 ZTRMM_L1x1_LOOP:                                            // steady state: 8 kernel macro invocations per iteration
1113
1114         KERNEL1x1_1
1115         KERNEL1x1_2
1116         KERNEL1x1_1
1117         KERNEL1x1_2
1118
1119         KERNEL1x1_1
1120         KERNEL1x1_2
1121         KERNEL1x1_1
1122         KERNEL1x1_2
1123
1124         addic.          L,      L,      -1                              // one group done, cr0 updated
1125         bgt             ZTRMM_L1x1_LOOP                                 // repeat while L > 0
1126
1127 ZTRMM_L1x1_LOOP_END:                                        // final group of 8
1128
1129         KERNEL1x1_1
1130         KERNEL1x1_2
1131         KERNEL1x1_1
1132         KERNEL1x1_2
1133
1134         KERNEL1x1_1
1135         KERNEL1x1_2
1136         KERNEL1x1_1
1137         KERNEL1x1_E2                                                    // _E2 variant closes the sequence (presumably issues no further loads — confirm in the macro file)
1138
1139         b               ZTRMM_L1x1_SUB1                                 // continue with the K1 & 7 remainder
1140
1141 ZTRMM_L1x1_SUB4:                                            // L == 1: eight steps, SUBI1 first and then seven SUB1
1142
1143         KERNEL1x1_SUBI1
1144         KERNEL1x1_SUB1
1145         KERNEL1x1_SUB1
1146         KERNEL1x1_SUB1
1147
1148         KERNEL1x1_SUB1
1149         KERNEL1x1_SUB1
1150         KERNEL1x1_SUB1
1151         KERNEL1x1_SUB1
1152
1153         b               ZTRMM_L1x1_SUB1                                 // continue with the K1 & 7 remainder
1154
1155 ZTRMM_L1x1_SUB0:                                            // K1 < 8: no full group was run
1156
1157         andi.           L,      K1,     7                                               // L = K1 & 7
1158
1159         KERNEL1x1_SUBI1                                                 // first step, issued before L is tested. NOTE(review): also runs if K1 were 0 — confirm K1 >= 1 is guaranteed on this path
1160
1161         addic.          L,      L,      -1                              // account for the step just done, cr0 updated
1162         ble             ZTRMM_L1x1_SAVE                                 // that was the only step
1163         b               ZTRMM_L1x1_SUB2                                 // L more single steps to go
1164
1165 ZTRMM_L1x1_SUB1:                                            // remainder after at least one full group
1166
1167         andi.           L,      K1,     7                                               // L = K1 & 7, cr0 updated
1168         ble             ZTRMM_L1x1_SAVE                                 // no remainder
1169
1170 ZTRMM_L1x1_SUB2:                                            // single-step loop, runs L times
1171
1172         KERNEL1x1_SUB1
1173
1174         addic.          L,      L,      -1                              // L -= 1, cr0 updated
1175         bgt             ZTRMM_L1x1_SUB2                                 // repeat while L > 0
1176
1177 ZTRMM_L1x1_SAVE:
1178
1179         SAVE1x1                                                         // macro defined elsewhere; presumably stores the accumulated 1x1 result to C — confirm
1180
1181 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1182         sub             T1,     K,      KKK                                     // T1 = K - KKK (steps not run by the loop above)
1183         slwi            T2,     T1,     4                       // T2 = T1 << 4 : byte count for B (16 bytes per step)
1184         slwi            T1,     T1,     4                       // T1 = T1 << 4 : byte count for A (16 bytes per step)
1185         add             BO,     BO,     T2                                      // BO += T2 (T2 is already a byte offset)
1186         add             AO,     AO,     T1                                      // AO += T1 (T1 is already a byte offset)
1187 #endif
1188
1189 #if defined(LEFT)
1190         addi            KK,     KK,     1                               // KK += 1 (number of A values handled per step by this block)
1191 #endif
1192
1193
1194 ZTRMM_L1x1_END:
1195
1196 #if !defined(LEFT)
1197         addi            KK,     KK,     1                                       // KK += 1, applied once here when LEFT is not defined (with LEFT, KK is advanced inside each tile block instead)
1198 #endif
1199
1200
1201 ZTRMM_L1_END: