added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8
[platform/upstream/openblas.git] / kernel / power / dtrmm_logic_16x4_power8.S
// Entry to the section that walks N in groups of four: J = N >> 2 passes.
// Each pass copies C and A into CO/AO, then advances C by LDC << 2 for the next pass.
// NOTE(review): slwi is a 32-bit shift, so LDC << 2 is assumed to fit in 32 bits - confirm.
1         srawi.          J,      N,      2                       // J = N >> 2; the record form sets CR0 for the branch below
2         ble             DTRMM_L4_END                            // J <= 0: skip this whole section
3
4 DTRMM_L4_BEGIN:
5
6         mr              CO,     C                               // CO = C, taken before C is advanced
7         mr              AO,     A                               // AO = A, restarted on every pass
8         slwi            T1,     LDC     ,       2               // T1 = LDC << 2
9         add             C,      C,      T1                      // C += 4 * LDC, ready for the next pass
10
11 #if defined(LEFT)
12         mr              KK,     OFFSET          // OFFSET -> KK (only when LEFT is defined)
13 #endif
14
15         srawi.          I,      M,      4                       // I = M >> 4: number of 16-wide tiles along M
16         ble             DTRMM_L4x16_END                         // I <= 0: go straight to the M & 15 remainder tiles
17
// 16x4 tile loop, executed I = M >> 4 times. Per the inline comments, one k-step uses
// 16 values of A and 4 values of B. AO is not reset here; it carries over from the previous tile.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm against the macro file.
// The trip count K1 is split into K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// dcbt AO, PRE is a cache-touch hint for the address AO + PRE.
18 DTRMM_L4x16_BEGIN:
19
20
21 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
22         mr              BO,     B                                       // B -> BO
23 #else
24         mr              BO,     B                                       // B -> BO
25         slwi            T1,     KK,     5                               // T1 = KK << 5 (KK * 32): Number of values in B shifted
26         slwi            T2,     KK,     7                               // T2 = KK << 7 (KK * 128): Number of values in A shifted
27         add             BO,     BO,     T1                              // Add values to BO
28         add             AO,     AO,     T2                              // Add values to AO
29 #endif
30
31 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
32         sub             T1,     K,      KK                              // K - KK -> TEMP1
33 #else
34         mr              T1,     KK                                      // KK -> KTEMP
35 #ifdef LEFT
36         addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
37 #else
38         addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
39 #endif
40 #endif
41
42         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
43         mr              K1,     T1                                      // K1 = trip count used by the loops below
44         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
45         ble             DTRMM_L4x16_SUB0                                // L <= 0 (K1 < 8): single-step path only
46         cmpwi           cr0,    L,      1
47         ble             DTRMM_L4x16_SUB4                                // L == 1: one group of eight through the SUB macros
48
49 DTRMM_L4x16_LOOP_START:                                                  // reached with L >= 2: LOAD + I1 + seven KERNEL macros
50
51         dcbt            AO,     PRE
52         LOAD4x16_1
53         dcbt            AO,     PRE
54         KERNEL4x16_I1
55         dcbt            AO,     PRE
56         KERNEL4x16_2
57         dcbt            AO,     PRE
58         KERNEL4x16_1
59         dcbt            AO,     PRE
60         KERNEL4x16_2
61
62         dcbt            AO,     PRE
63         KERNEL4x16_1
64         dcbt            AO,     PRE
65         KERNEL4x16_2
66         dcbt            AO,     PRE
67         KERNEL4x16_1
68         dcbt            AO,     PRE
69         KERNEL4x16_2
70
71         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
72         ble             DTRMM_L4x16_LOOP_END                            // no middle groups remain
73
74         .align 5
75
76 DTRMM_L4x16_LOOP:                                                        // middle groups: eight KERNEL macros per pass
77
78         dcbt            AO,     PRE
79         KERNEL4x16_1
80         dcbt            AO,     PRE
81         KERNEL4x16_2
82         dcbt            AO,     PRE
83         KERNEL4x16_1
84         dcbt            AO,     PRE
85         KERNEL4x16_2
86
87         dcbt            AO,     PRE
88         KERNEL4x16_1
89         dcbt            AO,     PRE
90         KERNEL4x16_2
91         dcbt            AO,     PRE
92         KERNEL4x16_1
93         dcbt            AO,     PRE
94         KERNEL4x16_2
95
96         addic.          L,      L,      -1                              // L -= 1
97         bgt             DTRMM_L4x16_LOOP                                // repeat while L > 0
98
99 DTRMM_L4x16_LOOP_END:                                                    // last group: seven KERNEL macros closed by the E2 variant
100
101         dcbt            AO,     PRE
102         KERNEL4x16_1
103         dcbt            AO,     PRE
104         KERNEL4x16_2
105         dcbt            AO,     PRE
106         KERNEL4x16_1
107         dcbt            AO,     PRE
108         KERNEL4x16_2
109
110         dcbt            AO,     PRE
111         KERNEL4x16_1
112         dcbt            AO,     PRE
113         KERNEL4x16_2
114         dcbt            AO,     PRE
115         KERNEL4x16_1
116         KERNEL4x16_E2
117
118         b               DTRMM_L4x16_SUB1                                // continue with the K1 & 7 leftover
119
120 DTRMM_L4x16_SUB4:                                                        // L == 1: SUBI1 followed by seven SUB1 macros
121
122         dcbt            AO,     PRE
123         KERNEL4x16_SUBI1
124         dcbt            AO,     PRE
125         KERNEL4x16_SUB1
126         dcbt            AO,     PRE
127         KERNEL4x16_SUB1
128         dcbt            AO,     PRE
129         KERNEL4x16_SUB1
130
131         KERNEL4x16_SUB1
132         KERNEL4x16_SUB1
133         KERNEL4x16_SUB1
134         KERNEL4x16_SUB1
135
136         b               DTRMM_L4x16_SUB1                                // continue with the K1 & 7 leftover
137
138 DTRMM_L4x16_SUB0:                                                        // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
139
140         andi.           L,      K1,     7                                               // K1 & 7 -> L
141
142         KERNEL4x16_SUBI1                                                 // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
143
144         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
145         ble             DTRMM_L4x16_SAVE                                // nothing further to do
146         b               DTRMM_L4x16_SUB2
147
148 DTRMM_L4x16_SUB1:
149
150         andi.           L,      K1,     7                                               // K1 & 7 -> L
151         ble             DTRMM_L4x16_SAVE                                // no leftover (andi. never yields a negative value, so this acts as beq)
152
153 DTRMM_L4x16_SUB2:                                                        // one SUB1 macro per remaining step
154
155         KERNEL4x16_SUB1
156
157         addic.          L,      L,      -1
158         bgt             DTRMM_L4x16_SUB2
159
160 DTRMM_L4x16_SAVE:
161
162         SAVE4x16                                                         // macro defined outside this view
163
164 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
165         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
166         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
167         slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
168         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
169         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
170 #endif
171
172 #if defined(LEFT)
173         addi            KK,     KK,     16                              // KK += Number of values in A
174 #endif
175
176
177         addic.          I,      I,      -1                              // I -= 1
178         bgt             DTRMM_L4x16_BEGIN                               // next 16-wide tile while I > 0
179
// Remainder handling after the 16-wide tiles: if M & 15 is zero all remainder tiles are skipped,
// otherwise the 8x4 tile below runs once when bit 3 of M (M & 8) is set.
// Same loop skeleton as the 16x4 tile: K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm.
180 DTRMM_L4x16_END:
181
182 DTRMM_L4x8_BEGIN:
183         andi.           T2,     M,      15                              // T2 = M & 15
184         ble             DTRMM_L4x1_END                                  // no remainder at all: skip the 8/4/2/1 tiles
185
186         andi.           T1,     M,      8                               // T1 = M & 8
187         ble             DTRMM_L4x8_END                                  // bit clear: no 8-wide tile
188
189 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
190         mr              BO,     B                                       // B -> BO
191 #else
192         mr              BO,     B                                       // B -> BO
193         slwi            T1,     KK,     5                               // T1 = KK << 5 (KK * 32): Number of values in B shifted
194         slwi            T2,     KK,     6                               // T2 = KK << 6 (KK * 64): Number of values in A shifted
195         add             BO,     BO,     T1                              // Add values to BO
196         add             AO,     AO,     T2                              // Add values to AO
197 #endif
198
199 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
200         sub             T1,     K,      KK                              // K - KK -> TEMP1
201 #else
202         mr              T1,     KK                                      // KK -> KTEMP
203 #ifdef LEFT
204         addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
205 #else
206         addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
207 #endif
208 #endif
209
210         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
211         mr              K1,     T1                                      // K1 = trip count used by the loops below
212         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
213         ble             DTRMM_L4x8_SUB0                                 // L <= 0 (K1 < 8): single-step path only
214         cmpwi           cr0,    L,      1
215         ble             DTRMM_L4x8_SUB4                                 // L == 1: one group of eight through the SUB macros
216
217 DTRMM_L4x8_LOOP_START:                                                   // reached with L >= 2: LOAD + I1 + seven KERNEL macros
218
219         LOAD4x8_1
220         KERNEL4x8_I1
221         KERNEL4x8_2
222         KERNEL4x8_1
223         KERNEL4x8_2
224
225         KERNEL4x8_1
226         KERNEL4x8_2
227         KERNEL4x8_1
228         KERNEL4x8_2
229
230         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
231         ble             DTRMM_L4x8_LOOP_END                             // no middle groups remain
232
233         .align 5
234
235 DTRMM_L4x8_LOOP:                                                         // middle groups: eight KERNEL macros per pass
236
237         KERNEL4x8_1
238         KERNEL4x8_2
239         KERNEL4x8_1
240         KERNEL4x8_2
241
242         KERNEL4x8_1
243         KERNEL4x8_2
244         KERNEL4x8_1
245         KERNEL4x8_2
246
247         addic.          L,      L,      -1                              // L -= 1
248         bgt             DTRMM_L4x8_LOOP                                 // repeat while L > 0
249
250 DTRMM_L4x8_LOOP_END:                                                     // last group: seven KERNEL macros closed by the E2 variant
251
252         KERNEL4x8_1
253         KERNEL4x8_2
254         KERNEL4x8_1
255         KERNEL4x8_2
256
257         KERNEL4x8_1
258         KERNEL4x8_2
259         KERNEL4x8_1
260         KERNEL4x8_E2
261
262         b               DTRMM_L4x8_SUB1                                 // continue with the K1 & 7 leftover
263
264 DTRMM_L4x8_SUB4:                                                         // L == 1: SUBI1 followed by seven SUB1 macros
265
266         KERNEL4x8_SUBI1
267         KERNEL4x8_SUB1
268         KERNEL4x8_SUB1
269         KERNEL4x8_SUB1
270
271         KERNEL4x8_SUB1
272         KERNEL4x8_SUB1
273         KERNEL4x8_SUB1
274         KERNEL4x8_SUB1
275
276         b               DTRMM_L4x8_SUB1                                 // continue with the K1 & 7 leftover
277
278 DTRMM_L4x8_SUB0:                                                         // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
279
280         andi.           L,      K1,     7                                               // K1 & 7 -> L
281
282         KERNEL4x8_SUBI1                                                  // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
283
284         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
285         ble             DTRMM_L4x8_SAVE                                 // nothing further to do
286         b               DTRMM_L4x8_SUB2
287
288 DTRMM_L4x8_SUB1:
289
290         andi.           L,      K1,     7                                               // K1 & 7 -> L
291         ble             DTRMM_L4x8_SAVE                                 // no leftover (andi. never yields a negative value, so this acts as beq)
292
293 DTRMM_L4x8_SUB2:                                                         // one SUB1 macro per remaining step
294
295         KERNEL4x8_SUB1
296
297         addic.          L,      L,      -1
298         bgt             DTRMM_L4x8_SUB2
299
300 DTRMM_L4x8_SAVE:
301
302         SAVE4x8                                                          // macro defined outside this view
303
304 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
305         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
306         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
307         slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
308         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
309         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
310 #endif
311
312 #if defined(LEFT)
313         addi            KK,     KK,     8                               // KK += Number of values in A
314 #endif
315
316
// 4x4 remainder tile: runs once when bit 2 of M (M & 4) is set.
// Same loop skeleton as the wider tiles: K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm.
317 DTRMM_L4x8_END:
318
319 DTRMM_L4x4_BEGIN:
320
321         andi.           T1,     M,      4                               // T1 = M & 4
322         ble             DTRMM_L4x4_END                                  // bit clear: no 4-wide tile
323
324 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
325         mr              BO,     B                                       // B -> BO
326 #else
327         mr              BO,     B                                       // B -> BO
328         slwi            T1,     KK,     5                               // T1 = KK << 5 (KK * 32): Number of values in B shifted
329         slwi            T2,     KK,     5                               // T2 = KK << 5 (KK * 32): Number of values in A shifted
330         add             BO,     BO,     T1                              // Add values to BO
331         add             AO,     AO,     T2                              // Add values to AO
332 #endif
333
334 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
335         sub             T1,     K,      KK                              // K - KK -> TEMP1
336 #else
337         mr              T1,     KK                                      // KK -> KTEMP
338 #ifdef LEFT
339         addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
340 #else
341         addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
342 #endif
343 #endif
344
345         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
346         mr              K1,     T1                                      // K1 = trip count used by the loops below
347         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
348         ble             DTRMM_L4x4_SUB0                                 // L <= 0 (K1 < 8): single-step path only
349         cmpwi           cr0,    L,      1
350         ble             DTRMM_L4x4_SUB4                                 // L == 1: one group of eight through the SUB macros
351
352 DTRMM_L4x4_LOOP_START:                                                   // reached with L >= 2: LOAD + I1 + seven KERNEL macros
353
354         LOAD4x4_1
355         KERNEL4x4_I1
356         KERNEL4x4_2
357         KERNEL4x4_1
358         KERNEL4x4_2
359
360         KERNEL4x4_1
361         KERNEL4x4_2
362         KERNEL4x4_1
363         KERNEL4x4_2
364
365         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
366         ble             DTRMM_L4x4_LOOP_END                             // no middle groups remain
367
368         .align 5
369
370 DTRMM_L4x4_LOOP:                                                         // middle groups: eight KERNEL macros per pass
371
372         KERNEL4x4_1
373         KERNEL4x4_2
374         KERNEL4x4_1
375         KERNEL4x4_2
376
377         KERNEL4x4_1
378         KERNEL4x4_2
379         KERNEL4x4_1
380         KERNEL4x4_2
381
382         addic.          L,      L,      -1                              // L -= 1
383         bgt             DTRMM_L4x4_LOOP                                 // repeat while L > 0
384
385 DTRMM_L4x4_LOOP_END:                                                     // last group: seven KERNEL macros closed by the E2 variant
386
387         KERNEL4x4_1
388         KERNEL4x4_2
389         KERNEL4x4_1
390         KERNEL4x4_2
391
392         KERNEL4x4_1
393         KERNEL4x4_2
394         KERNEL4x4_1
395         KERNEL4x4_E2
396
397         b               DTRMM_L4x4_SUB1                                 // continue with the K1 & 7 leftover
398
399 DTRMM_L4x4_SUB4:                                                         // L == 1: SUBI1 followed by seven SUB1 macros
400
401         KERNEL4x4_SUBI1
402         KERNEL4x4_SUB1
403         KERNEL4x4_SUB1
404         KERNEL4x4_SUB1
405
406         KERNEL4x4_SUB1
407         KERNEL4x4_SUB1
408         KERNEL4x4_SUB1
409         KERNEL4x4_SUB1
410
411         b               DTRMM_L4x4_SUB1                                 // continue with the K1 & 7 leftover
412
413 DTRMM_L4x4_SUB0:                                                         // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
414
415         andi.           L,      K1,     7                                               // K1 & 7 -> L
416
417         KERNEL4x4_SUBI1                                                  // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
418
419         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
420         ble             DTRMM_L4x4_SAVE                                 // nothing further to do
421         b               DTRMM_L4x4_SUB2
422
423 DTRMM_L4x4_SUB1:
424
425         andi.           L,      K1,     7                                               // K1 & 7 -> L
426         ble             DTRMM_L4x4_SAVE                                 // no leftover (andi. never yields a negative value, so this acts as beq)
427
428 DTRMM_L4x4_SUB2:                                                         // one SUB1 macro per remaining step
429
430         KERNEL4x4_SUB1
431
432         addic.          L,      L,      -1
433         bgt             DTRMM_L4x4_SUB2
434
435 DTRMM_L4x4_SAVE:
436
437         SAVE4x4                                                          // macro defined outside this view
438
439 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
440         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
441         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
442         slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
443         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
444         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
445 #endif
446
447 #if defined(LEFT)
448         addi            KK,     KK,     4                               // KK += Number of values in A
449 #endif
450
451
// 2x4 remainder tile: runs once when bit 1 of M (M & 2) is set.
// Same loop skeleton as the wider tiles: K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm.
452 DTRMM_L4x4_END:
453
454 DTRMM_L4x2_BEGIN:
455
456         andi.           T1,     M,      2                               // T1 = M & 2
457         ble             DTRMM_L4x2_END                                  // bit clear: no 2-wide tile
458
459 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
460         mr              BO,     B                                       // B -> BO
461 #else
462         mr              BO,     B                                       // B -> BO
463         slwi            T1,     KK,     5                               // T1 = KK << 5 (KK * 32): Number of values in B shifted
464         slwi            T2,     KK,     4                               // T2 = KK << 4 (KK * 16): Number of values in A shifted
465         add             BO,     BO,     T1                              // Add values to BO
466         add             AO,     AO,     T2                              // Add values to AO
467 #endif
468
469 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
470         sub             T1,     K,      KK                              // K - KK -> TEMP1
471 #else
472         mr              T1,     KK                                      // KK -> KTEMP
473 #ifdef LEFT
474         addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
475 #else
476         addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
477 #endif
478 #endif
479
480         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
481         mr              K1,     T1                                      // K1 = trip count used by the loops below
482         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
483         ble             DTRMM_L4x2_SUB0                                 // L <= 0 (K1 < 8): single-step path only
484         cmpwi           cr0,    L,      1
485         ble             DTRMM_L4x2_SUB4                                 // L == 1: one group of eight through the SUB macros
486
487 DTRMM_L4x2_LOOP_START:                                                   // reached with L >= 2: LOAD + I1 + seven KERNEL macros
488
489         LOAD4x2_1
490         KERNEL4x2_I1
491         KERNEL4x2_2
492         KERNEL4x2_1
493         KERNEL4x2_2
494
495         KERNEL4x2_1
496         KERNEL4x2_2
497         KERNEL4x2_1
498         KERNEL4x2_2
499
500         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
501         ble             DTRMM_L4x2_LOOP_END                             // no middle groups remain
502
503         .align 5
504
505 DTRMM_L4x2_LOOP:                                                         // middle groups: eight KERNEL macros per pass
506
507         KERNEL4x2_1
508         KERNEL4x2_2
509         KERNEL4x2_1
510         KERNEL4x2_2
511
512         KERNEL4x2_1
513         KERNEL4x2_2
514         KERNEL4x2_1
515         KERNEL4x2_2
516
517         addic.          L,      L,      -1                              // L -= 1
518         bgt             DTRMM_L4x2_LOOP                                 // repeat while L > 0
519
520 DTRMM_L4x2_LOOP_END:                                                     // last group: seven KERNEL macros closed by the E2 variant
521
522         KERNEL4x2_1
523         KERNEL4x2_2
524         KERNEL4x2_1
525         KERNEL4x2_2
526
527         KERNEL4x2_1
528         KERNEL4x2_2
529         KERNEL4x2_1
530         KERNEL4x2_E2
531
532         b               DTRMM_L4x2_SUB1                                 // continue with the K1 & 7 leftover
533
534 DTRMM_L4x2_SUB4:                                                         // L == 1: SUBI1 followed by seven SUB1 macros
535
536         KERNEL4x2_SUBI1
537         KERNEL4x2_SUB1
538         KERNEL4x2_SUB1
539         KERNEL4x2_SUB1
540
541         KERNEL4x2_SUB1
542         KERNEL4x2_SUB1
543         KERNEL4x2_SUB1
544         KERNEL4x2_SUB1
545
546         b               DTRMM_L4x2_SUB1                                 // continue with the K1 & 7 leftover
547
548 DTRMM_L4x2_SUB0:                                                         // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
549
550         andi.           L,      K1,     7                                               // K1 & 7 -> L
551
552         KERNEL4x2_SUBI1                                                  // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
553
554         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
555         ble             DTRMM_L4x2_SAVE                                 // nothing further to do
556         b               DTRMM_L4x2_SUB2
557
558 DTRMM_L4x2_SUB1:
559
560         andi.           L,      K1,     7                                               // K1 & 7 -> L
561         ble             DTRMM_L4x2_SAVE                                 // no leftover (andi. never yields a negative value, so this acts as beq)
562
563 DTRMM_L4x2_SUB2:                                                         // one SUB1 macro per remaining step
564
565         KERNEL4x2_SUB1
566
567         addic.          L,      L,      -1
568         bgt             DTRMM_L4x2_SUB2
569
570 DTRMM_L4x2_SAVE:
571
572         SAVE4x2                                                          // macro defined outside this view
573
574 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
575         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
576         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
577         slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
578         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
579         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
580 #endif
581
582 #if defined(LEFT)
583         addi            KK,     KK,     2                               // KK += Number of values in A
584 #endif
585
586
// 1x4 remainder tile: runs once when bit 0 of M (M & 1) is set.
// Same loop skeleton as the wider tiles: K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm.
587 DTRMM_L4x2_END:
588
589 DTRMM_L4x1_BEGIN:
590
591         andi.           T1,     M,      1                               // T1 = M & 1
592         ble             DTRMM_L4x1_END                                  // bit clear: no 1-wide tile
593
594 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
595         mr              BO,     B                                       // B -> BO
596 #else
597         mr              BO,     B                                       // B -> BO
598         slwi            T1,     KK,     5                               // T1 = KK << 5 (KK * 32): Number of values in B shifted
599         slwi            T2,     KK,     3                               // T2 = KK << 3 (KK * 8): Number of values in A shifted
600         add             BO,     BO,     T1                              // Add values to BO
601         add             AO,     AO,     T2                              // Add values to AO
602 #endif
603
604 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
605         sub             T1,     K,      KK                              // K - KK -> TEMP1
606 #else
607         mr              T1,     KK                                      // KK -> KTEMP
608 #ifdef LEFT
609         addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
610 #else
611         addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
612 #endif
613 #endif
614
615         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
616         mr              K1,     T1                                      // K1 = trip count used by the loops below
617         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
618         ble             DTRMM_L4x1_SUB0                                 // L <= 0 (K1 < 8): single-step path only
619         cmpwi           cr0,    L,      1
620         ble             DTRMM_L4x1_SUB4                                 // L == 1: one group of eight through the SUB macros
621
622 DTRMM_L4x1_LOOP_START:                                                   // reached with L >= 2: LOAD + I1 + seven KERNEL macros
623
624         LOAD4x1_1
625         KERNEL4x1_I1
626         KERNEL4x1_2
627         KERNEL4x1_1
628         KERNEL4x1_2
629
630         KERNEL4x1_1
631         KERNEL4x1_2
632         KERNEL4x1_1
633         KERNEL4x1_2
634
635         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
636         ble             DTRMM_L4x1_LOOP_END                             // no middle groups remain
637
638         .align 5
639
640 DTRMM_L4x1_LOOP:                                                         // middle groups: eight KERNEL macros per pass
641
642         KERNEL4x1_1
643         KERNEL4x1_2
644         KERNEL4x1_1
645         KERNEL4x1_2
646
647         KERNEL4x1_1
648         KERNEL4x1_2
649         KERNEL4x1_1
650         KERNEL4x1_2
651
652         addic.          L,      L,      -1                              // L -= 1
653         bgt             DTRMM_L4x1_LOOP                                 // repeat while L > 0
654
655 DTRMM_L4x1_LOOP_END:                                                     // last group: seven KERNEL macros closed by the E2 variant
656
657         KERNEL4x1_1
658         KERNEL4x1_2
659         KERNEL4x1_1
660         KERNEL4x1_2
661
662         KERNEL4x1_1
663         KERNEL4x1_2
664         KERNEL4x1_1
665         KERNEL4x1_E2
666
667         b               DTRMM_L4x1_SUB1                                 // continue with the K1 & 7 leftover
668
669 DTRMM_L4x1_SUB4:                                                         // L == 1: SUBI1 followed by seven SUB1 macros
670
671         KERNEL4x1_SUBI1
672         KERNEL4x1_SUB1
673         KERNEL4x1_SUB1
674         KERNEL4x1_SUB1
675
676         KERNEL4x1_SUB1
677         KERNEL4x1_SUB1
678         KERNEL4x1_SUB1
679         KERNEL4x1_SUB1
680
681         b               DTRMM_L4x1_SUB1                                 // continue with the K1 & 7 leftover
682
683 DTRMM_L4x1_SUB0:                                                         // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
684
685         andi.           L,      K1,     7                                               // K1 & 7 -> L
686
687         KERNEL4x1_SUBI1                                                  // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
688
689         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
690         ble             DTRMM_L4x1_SAVE                                 // nothing further to do
691         b               DTRMM_L4x1_SUB2
692
693 DTRMM_L4x1_SUB1:
694
695         andi.           L,      K1,     7                                               // K1 & 7 -> L
696         ble             DTRMM_L4x1_SAVE                                 // no leftover (andi. never yields a negative value, so this acts as beq)
697
698 DTRMM_L4x1_SUB2:                                                         // one SUB1 macro per remaining step
699
700         KERNEL4x1_SUB1
701
702         addic.          L,      L,      -1
703         bgt             DTRMM_L4x1_SUB2
704
705 DTRMM_L4x1_SAVE:
706
707         SAVE4x1                                                          // macro defined outside this view
708
709 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
710         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
711         slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
712         slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
713         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
714         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
715 #endif
716
717 #if defined(LEFT)
718         addi            KK,     KK,     1                               // KK += Number of values in A
719 #endif
720
721
// Tail of one four-wide pass over N: advance B by K << 5, bump KK when LEFT is not defined,
// then loop while J > 0. After the last pass, N & 3 decides between finishing at L999
// (defined outside this view) and continuing with the two-wide section.
722 DTRMM_L4x1_END:
723
724         slwi            T1,     K,      5                               // T1 = K << 5 (K * 32)
725         add             B,      B,      T1                              // B += K * 32; presumably K steps of 4 values of 8 bytes - TODO confirm
726
727 #if !defined(LEFT)
728         addi            KK,     KK,     4                                       // KK += Number of values in B
729 #endif
730
731
732         addic.          J,      J,      -1                              // J -= 1
733         bgt             DTRMM_L4_BEGIN                                  // next four-wide pass while J > 0
734
735         andi.           T2,     N,      3                               // T2 = N & 3
736         ble             L999                                            // nothing left along N (andi. never yields a negative value, so this acts as beq)
737
738 DTRMM_L4_END:                                                            // also the target when N >> 2 was <= 0 on entry
739
740         b               DTRMM_L2_BEGIN
741
742 L999_H1:                                                                 // not referenced in the visible part of the file; forwards to L999
743
744         b               L999
745
// Two-wide section along N: runs once when bit 1 of N (N & 2) is set.
// Copies C and A into CO/AO, then advances C by LDC << 1.
// NOTE(review): slwi is a 32-bit shift, so LDC << 1 is assumed to fit in 32 bits - confirm.
746 DTRMM_L2_BEGIN:
747
748         andi.           T1,     N,      2                               // T1 = N & 2
749         ble             DTRMM_L2_END                                    // bit clear: skip the two-wide section
750         mr              CO,     C                                       // CO = C, taken before C is advanced
751         mr              AO,     A                                       // AO = A
752         slwi            T1,     LDC     ,       1                       // T1 = LDC << 1
753         add             C,      C,      T1                              // C += 2 * LDC
754
755 #if defined(LEFT)
756         mr              KK,     OFFSET          // OFFSET -> KK (only when LEFT is defined)
757 #endif
758
759         srawi.          I,      M,      4                               // I = M >> 4: number of 16-wide tiles along M
760         ble             DTRMM_L2x16_END                                 // I <= 0: go straight to the M & 15 remainder tiles
761
// 16x2 tile loop, executed I = M >> 4 times. Per the inline comments, one k-step uses
// 16 values of A and 2 values of B. AO is not reset here; it carries over from the previous tile.
// The LOAD*/KERNEL*/SAVE* macros are defined outside this view; each KERNEL*/SUB* invocation
// presumably consumes one k-step - TODO confirm against the macro file.
// The trip count K1 is split into K1 >> 3 unrolled groups of eight plus K1 & 7 single steps.
// dcbt AO, PRE is a cache-touch hint for the address AO + PRE.
762 DTRMM_L2x16_BEGIN:
763
764
765 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
766         mr              BO,     B                                       // B -> BO
767 #else
768         mr              BO,     B                                       // B -> BO
769         slwi            T1,     KK,     4                               // T1 = KK << 4 (KK * 16): Number of values in B shifted
770         slwi            T2,     KK,     7                               // T2 = KK << 7 (KK * 128): Number of values in A shifted
771         add             BO,     BO,     T1                              // Add values to BO
772         add             AO,     AO,     T2                              // Add values to AO
773 #endif
774
775 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
776         sub             T1,     K,      KK                              // K - KK -> TEMP1
777 #else
778         mr              T1,     KK                                      // KK -> KTEMP
779 #ifdef LEFT
780         addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
781 #else
782         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
783 #endif
784 #endif
785
786         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
787         mr              K1,     T1                                      // K1 = trip count used by the loops below
788         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
789         ble             DTRMM_L2x16_SUB0                                // L <= 0 (K1 < 8): single-step path only
790         cmpwi           cr0,    L,      1
791         ble             DTRMM_L2x16_SUB4                                // L == 1: one group of eight through the SUB macros
792
793 DTRMM_L2x16_LOOP_START:                                                  // reached with L >= 2: LOAD + I1 + seven KERNEL macros
794
795         dcbt            AO,     PRE
796         LOAD2x16_1
797         dcbt            AO,     PRE
798         KERNEL2x16_I1
799         dcbt            AO,     PRE
800         KERNEL2x16_2
801         dcbt            AO,     PRE
802         KERNEL2x16_1
803         dcbt            AO,     PRE
804         KERNEL2x16_2
805
806         dcbt            AO,     PRE
807         KERNEL2x16_1
808         dcbt            AO,     PRE
809         KERNEL2x16_2
810         dcbt            AO,     PRE
811         KERNEL2x16_1
812         dcbt            AO,     PRE
813         KERNEL2x16_2
814
815         addic.          L,      L,      -2                              // L -= 2: LOOP_START and LOOP_END each cover one group
816         ble             DTRMM_L2x16_LOOP_END                            // no middle groups remain
817
818         .align 5
819
820 DTRMM_L2x16_LOOP:                                                        // middle groups: eight KERNEL macros per pass
821
822         dcbt            AO,     PRE
823         KERNEL2x16_1
824         dcbt            AO,     PRE
825         KERNEL2x16_2
826         dcbt            AO,     PRE
827         KERNEL2x16_1
828         dcbt            AO,     PRE
829         KERNEL2x16_2
830
831         dcbt            AO,     PRE
832         KERNEL2x16_1
833         dcbt            AO,     PRE
834         KERNEL2x16_2
835         dcbt            AO,     PRE
836         KERNEL2x16_1
837         dcbt            AO,     PRE
838         KERNEL2x16_2
839
840         addic.          L,      L,      -1                              // L -= 1
841         bgt             DTRMM_L2x16_LOOP                                // repeat while L > 0
842
843 DTRMM_L2x16_LOOP_END:                                                    // last group: seven KERNEL macros closed by the E2 variant
844
845         dcbt            AO,     PRE
846         KERNEL2x16_1
847         dcbt            AO,     PRE
848         KERNEL2x16_2
849         dcbt            AO,     PRE
850         KERNEL2x16_1
851         dcbt            AO,     PRE
852         KERNEL2x16_2
853
854         dcbt            AO,     PRE
855         KERNEL2x16_1
856         dcbt            AO,     PRE
857         KERNEL2x16_2
858         dcbt            AO,     PRE
859         KERNEL2x16_1
860         KERNEL2x16_E2
861
862         b               DTRMM_L2x16_SUB1                                // continue with the K1 & 7 leftover
863
864 DTRMM_L2x16_SUB4:                                                        // L == 1: SUBI1 followed by seven SUB1 macros
865
866         dcbt            AO,     PRE
867         KERNEL2x16_SUBI1
868         dcbt            AO,     PRE
869         KERNEL2x16_SUB1
870         dcbt            AO,     PRE
871         KERNEL2x16_SUB1
872         dcbt            AO,     PRE
873         KERNEL2x16_SUB1
874
875         KERNEL2x16_SUB1
876         KERNEL2x16_SUB1
877         KERNEL2x16_SUB1
878         KERNEL2x16_SUB1
879
880         b               DTRMM_L2x16_SUB1                                // continue with the K1 & 7 leftover
881
882 DTRMM_L2x16_SUB0:                                                        // K1 >> 3 <= 0: SUBI1 first, then L - 1 further SUB1 steps
883
884         andi.           L,      K1,     7                                               // K1 & 7 -> L
885
886         KERNEL2x16_SUBI1                                                 // NOTE(review): issued before L is tested, so it runs even if K1 were 0 - confirm K1 >= 1 here
887
888         addic.          L,      L,      -1                              // account for the SUBI1 step just issued
889         ble             DTRMM_L2x16_SAVE                                // nothing further to do
890         b               DTRMM_L2x16_SUB2
891
892 DTRMM_L2x16_SUB1:
893
894         andi.           L,      K1,     7                                               // K1 & 7 -> L
895         ble             DTRMM_L2x16_SAVE                                // no leftover (andi. never yields a negative value, so this acts as beq)
896
897 DTRMM_L2x16_SUB2:                                                        // one SUB1 macro per remaining step
898
899         KERNEL2x16_SUB1
900
901         addic.          L,      L,      -1
902         bgt             DTRMM_L2x16_SUB2
903
904 DTRMM_L2x16_SAVE:
905
906         SAVE2x16                                                         // macro defined outside this view
907
908 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
909         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
910         slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
911         slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
912         add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
913         add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
914 #endif
915
916 #if defined(LEFT)
917         addi            KK,     KK,     16                              // KK += Number of values in A
918 #endif
919
920
921         addic.          I,      I,      -1                              // I -= 1
922         bgt             DTRMM_L2x16_BEGIN                               // next 16-wide tile while I > 0
923
924 DTRMM_L2x16_END:
925
926 DTRMM_L2x8_BEGIN:                                                       // Remainder tile: 8 values of A x 2 values of B per k-step; runs at most once
927         andi.           T2,     M,      15                              // T2 = M & 15
928         ble             DTRMM_L2x1_END                                  // andi. never yields a negative value, so taken only when M & 15 == 0: no remainder tiles
929
930         andi.           T1,     M,      8
931         ble             DTRMM_L2x8_END                                  // bit 3 of M clear -> no 8-wide tile
932         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
933 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
934         mr              BO,     B                                       // B -> BO
935 #else
936         mr              BO,     B                                       // B -> BO
937         slwi            T1,     KK,     4                               // T1 = KK * 16: bytes in KK k-steps of B (2 values each)
938         slwi            T2,     KK,     6                               // T2 = KK * 64: bytes in KK k-steps of A (8 values each)
939         add             BO,     BO,     T1                              // BO skips the first KK k-steps
940         add             AO,     AO,     T2                              // AO skips the first KK k-steps
941 #endif
942         // T1 = number of k-steps this tile has to process.
943 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
944         sub             T1,     K,      KK                              // K - KK -> TEMP1
945 #else
946         mr              T1,     KK                                      // KK -> KTEMP
947 #ifdef LEFT
948         addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
949 #else
950         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
951 #endif
952 #endif
953
954         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
955         mr              K1,     T1                                      // K1 = step count used for loop control
956         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
957         ble             DTRMM_L2x8_SUB0                                 // L <= 0: fewer than 8 steps
958         cmpwi           cr0,    L,      1
959         ble             DTRMM_L2x8_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
960         // L >= 2 from here. KERNEL2x8_*/LOAD2x8_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
961 DTRMM_L2x8_LOOP_START:
962         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
963         LOAD2x8_1
964         KERNEL2x8_I1
965         KERNEL2x8_2
966         KERNEL2x8_1
967         KERNEL2x8_2
968
969         KERNEL2x8_1
970         KERNEL2x8_2
971         KERNEL2x8_1
972         KERNEL2x8_2
973
974         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
975         ble             DTRMM_L2x8_LOOP_END
976
977         .align 5
978
979 DTRMM_L2x8_LOOP:
980         // Middle groups: 8 macro invocations per unit of L.
981         KERNEL2x8_1
982         KERNEL2x8_2
983         KERNEL2x8_1
984         KERNEL2x8_2
985
986         KERNEL2x8_1
987         KERNEL2x8_2
988         KERNEL2x8_1
989         KERNEL2x8_2
990
991         addic.          L,      L,      -1
992         bgt             DTRMM_L2x8_LOOP
993
994 DTRMM_L2x8_LOOP_END:
995         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
996         KERNEL2x8_1
997         KERNEL2x8_2
998         KERNEL2x8_1
999         KERNEL2x8_2
1000
1001         KERNEL2x8_1
1002         KERNEL2x8_2
1003         KERNEL2x8_1
1004         KERNEL2x8_E2
1005
1006         b               DTRMM_L2x8_SUB1                                 // continue with the K1 & 7 remainder
1007
1008 DTRMM_L2x8_SUB4:
1009         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1010         KERNEL2x8_SUBI1
1011         KERNEL2x8_SUB1
1012         KERNEL2x8_SUB1
1013         KERNEL2x8_SUB1
1014
1015         KERNEL2x8_SUB1
1016         KERNEL2x8_SUB1
1017         KERNEL2x8_SUB1
1018         KERNEL2x8_SUB1
1019
1020         b               DTRMM_L2x8_SUB1
1021
1022 DTRMM_L2x8_SUB0:
1023         // K1 < 8: _SUBI1 does the first step, DTRMM_L2x8_SUB2 the remaining (K1 & 7) - 1.
1024         andi.           L,      K1,     7                                               // K1 & 7 -> L
1025         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1026         KERNEL2x8_SUBI1
1027
1028         addic.          L,      L,      -1
1029         ble             DTRMM_L2x8_SAVE
1030         b               DTRMM_L2x8_SUB2
1031
1032 DTRMM_L2x8_SUB1:
1033         // Remainder after the groups of 8.
1034         andi.           L,      K1,     7                                               // K1 & 7 -> L
1035         ble             DTRMM_L2x8_SAVE                                 // no remainder
1036
1037 DTRMM_L2x8_SUB2:
1038         // One _SUB1 invocation per remaining count in L.
1039         KERNEL2x8_SUB1
1040
1041         addic.          L,      L,      -1
1042         bgt             DTRMM_L2x8_SUB2
1043
1044 DTRMM_L2x8_SAVE:
1045         // SAVE2x8 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1046         SAVE2x8
1047         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1048 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1049         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1050         slwi            T2,     T1,     4                       // TEMP2 = TEMP1 * 16 (bytes of B per k-step)
1051         slwi            T1,     T1,     6                       // TEMP1 = TEMP1 * 64 (bytes of A per k-step)
1052         add             BO,     BO,     T2                                      // BO += TEMP2
1053         add             AO,     AO,     T1                                      // AO += TEMP1
1054 #endif
1055
1056 #if defined(LEFT)
1057         addi            KK,     KK,     8                               // KK += Number of values in A
1058 #endif
1059
1060
1061 DTRMM_L2x8_END:
1062
1063 DTRMM_L2x4_BEGIN:                                                       // Remainder tile: 4 values of A x 2 values of B per k-step; runs at most once
1064
1065         andi.           T1,     M,      4
1066         ble             DTRMM_L2x4_END                                  // andi. result is never negative: taken when bit 2 of M is clear
1067         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
1068 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1069         mr              BO,     B                                       // B -> BO
1070 #else
1071         mr              BO,     B                                       // B -> BO
1072         slwi            T1,     KK,     4                               // T1 = KK * 16: bytes in KK k-steps of B (2 values each)
1073         slwi            T2,     KK,     5                               // T2 = KK * 32: bytes in KK k-steps of A (4 values each)
1074         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1075         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1076 #endif
1077         // T1 = number of k-steps this tile has to process.
1078 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1079         sub             T1,     K,      KK                              // K - KK -> TEMP1
1080 #else
1081         mr              T1,     KK                                      // KK -> KTEMP
1082 #ifdef LEFT
1083         addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
1084 #else
1085         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
1086 #endif
1087 #endif
1088
1089         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1090         mr              K1,     T1                                      // K1 = step count used for loop control
1091         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1092         ble             DTRMM_L2x4_SUB0                                 // L <= 0: fewer than 8 steps
1093         cmpwi           cr0,    L,      1
1094         ble             DTRMM_L2x4_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
1095         // L >= 2 from here. KERNEL2x4_*/LOAD2x4_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1096 DTRMM_L2x4_LOOP_START:
1097         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
1098         LOAD2x4_1
1099         KERNEL2x4_I1
1100         KERNEL2x4_2
1101         KERNEL2x4_1
1102         KERNEL2x4_2
1103
1104         KERNEL2x4_1
1105         KERNEL2x4_2
1106         KERNEL2x4_1
1107         KERNEL2x4_2
1108
1109         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1110         ble             DTRMM_L2x4_LOOP_END
1111
1112         .align 5
1113
1114 DTRMM_L2x4_LOOP:
1115         // Middle groups: 8 macro invocations per unit of L.
1116         KERNEL2x4_1
1117         KERNEL2x4_2
1118         KERNEL2x4_1
1119         KERNEL2x4_2
1120
1121         KERNEL2x4_1
1122         KERNEL2x4_2
1123         KERNEL2x4_1
1124         KERNEL2x4_2
1125
1126         addic.          L,      L,      -1
1127         bgt             DTRMM_L2x4_LOOP
1128
1129 DTRMM_L2x4_LOOP_END:
1130         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
1131         KERNEL2x4_1
1132         KERNEL2x4_2
1133         KERNEL2x4_1
1134         KERNEL2x4_2
1135
1136         KERNEL2x4_1
1137         KERNEL2x4_2
1138         KERNEL2x4_1
1139         KERNEL2x4_E2
1140
1141         b               DTRMM_L2x4_SUB1                                 // continue with the K1 & 7 remainder
1142
1143 DTRMM_L2x4_SUB4:
1144         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1145         KERNEL2x4_SUBI1
1146         KERNEL2x4_SUB1
1147         KERNEL2x4_SUB1
1148         KERNEL2x4_SUB1
1149
1150         KERNEL2x4_SUB1
1151         KERNEL2x4_SUB1
1152         KERNEL2x4_SUB1
1153         KERNEL2x4_SUB1
1154
1155         b               DTRMM_L2x4_SUB1
1156
1157 DTRMM_L2x4_SUB0:
1158         // K1 < 8: _SUBI1 does the first step, DTRMM_L2x4_SUB2 the remaining (K1 & 7) - 1.
1159         andi.           L,      K1,     7                                               // K1 & 7 -> L
1160         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1161         KERNEL2x4_SUBI1
1162
1163         addic.          L,      L,      -1
1164         ble             DTRMM_L2x4_SAVE
1165         b               DTRMM_L2x4_SUB2
1166
1167 DTRMM_L2x4_SUB1:
1168         // Remainder after the groups of 8.
1169         andi.           L,      K1,     7                                               // K1 & 7 -> L
1170         ble             DTRMM_L2x4_SAVE                                 // no remainder
1171
1172 DTRMM_L2x4_SUB2:
1173         // One _SUB1 invocation per remaining count in L.
1174         KERNEL2x4_SUB1
1175
1176         addic.          L,      L,      -1
1177         bgt             DTRMM_L2x4_SUB2
1178
1179 DTRMM_L2x4_SAVE:
1180         // SAVE2x4 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1181         SAVE2x4
1182         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1183 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1184         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1185         slwi            T2,     T1,     4                       // TEMP2 = TEMP1 * 16 (bytes of B per k-step)
1186         slwi            T1,     T1,     5                       // TEMP1 = TEMP1 * 32 (bytes of A per k-step)
1187         add             BO,     BO,     T2                                      // BO += TEMP2
1188         add             AO,     AO,     T1                                      // AO += TEMP1
1189 #endif
1190
1191 #if defined(LEFT)
1192         addi            KK,     KK,     4                               // KK += Number of values in A
1193 #endif
1194
1195
1196 DTRMM_L2x4_END:
1197
1198 DTRMM_L2x2_BEGIN:                                                       // Remainder tile: 2 values of A x 2 values of B per k-step; runs at most once
1199
1200         andi.           T1,     M,      2
1201         ble             DTRMM_L2x2_END                                  // andi. result is never negative: taken when bit 1 of M is clear
1202         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
1203 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1204         mr              BO,     B                                       // B -> BO
1205 #else
1206         mr              BO,     B                                       // B -> BO
1207         slwi            T1,     KK,     4                               // T1 = KK * 16: bytes in KK k-steps of B (2 values each)
1208         slwi            T2,     KK,     4                               // T2 = KK * 16: bytes in KK k-steps of A (2 values each)
1209         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1210         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1211 #endif
1212         // T1 = number of k-steps this tile has to process.
1213 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1214         sub             T1,     K,      KK                              // K - KK -> TEMP1
1215 #else
1216         mr              T1,     KK                                      // KK -> KTEMP
1217 #ifdef LEFT
1218         addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
1219 #else
1220         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
1221 #endif
1222 #endif
1223
1224         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1225         mr              K1,     T1                                      // K1 = step count used for loop control
1226         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1227         ble             DTRMM_L2x2_SUB0                                 // L <= 0: fewer than 8 steps
1228         cmpwi           cr0,    L,      1
1229         ble             DTRMM_L2x2_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
1230         // L >= 2 from here. KERNEL2x2_*/LOAD2x2_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1231 DTRMM_L2x2_LOOP_START:
1232         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
1233         LOAD2x2_1
1234         KERNEL2x2_I1
1235         KERNEL2x2_2
1236         KERNEL2x2_1
1237         KERNEL2x2_2
1238
1239         KERNEL2x2_1
1240         KERNEL2x2_2
1241         KERNEL2x2_1
1242         KERNEL2x2_2
1243
1244         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1245         ble             DTRMM_L2x2_LOOP_END
1246
1247         .align 5
1248
1249 DTRMM_L2x2_LOOP:
1250         // Middle groups: 8 macro invocations per unit of L.
1251         KERNEL2x2_1
1252         KERNEL2x2_2
1253         KERNEL2x2_1
1254         KERNEL2x2_2
1255
1256         KERNEL2x2_1
1257         KERNEL2x2_2
1258         KERNEL2x2_1
1259         KERNEL2x2_2
1260
1261         addic.          L,      L,      -1
1262         bgt             DTRMM_L2x2_LOOP
1263
1264 DTRMM_L2x2_LOOP_END:
1265         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
1266         KERNEL2x2_1
1267         KERNEL2x2_2
1268         KERNEL2x2_1
1269         KERNEL2x2_2
1270
1271         KERNEL2x2_1
1272         KERNEL2x2_2
1273         KERNEL2x2_1
1274         KERNEL2x2_E2
1275
1276         b               DTRMM_L2x2_SUB1                                 // continue with the K1 & 7 remainder
1277
1278 DTRMM_L2x2_SUB4:
1279         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1280         KERNEL2x2_SUBI1
1281         KERNEL2x2_SUB1
1282         KERNEL2x2_SUB1
1283         KERNEL2x2_SUB1
1284
1285         KERNEL2x2_SUB1
1286         KERNEL2x2_SUB1
1287         KERNEL2x2_SUB1
1288         KERNEL2x2_SUB1
1289
1290         b               DTRMM_L2x2_SUB1
1291
1292 DTRMM_L2x2_SUB0:
1293         // K1 < 8: _SUBI1 does the first step, DTRMM_L2x2_SUB2 the remaining (K1 & 7) - 1.
1294         andi.           L,      K1,     7                                               // K1 & 7 -> L
1295         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1296         KERNEL2x2_SUBI1
1297
1298         addic.          L,      L,      -1
1299         ble             DTRMM_L2x2_SAVE
1300         b               DTRMM_L2x2_SUB2
1301
1302 DTRMM_L2x2_SUB1:
1303         // Remainder after the groups of 8.
1304         andi.           L,      K1,     7                                               // K1 & 7 -> L
1305         ble             DTRMM_L2x2_SAVE                                 // no remainder
1306
1307 DTRMM_L2x2_SUB2:
1308         // One _SUB1 invocation per remaining count in L.
1309         KERNEL2x2_SUB1
1310
1311         addic.          L,      L,      -1
1312         bgt             DTRMM_L2x2_SUB2
1313
1314 DTRMM_L2x2_SAVE:
1315         // SAVE2x2 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1316         SAVE2x2
1317         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1318 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1319         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1320         slwi            T2,     T1,     4                       // TEMP2 = TEMP1 * 16 (bytes of B per k-step)
1321         slwi            T1,     T1,     4                       // TEMP1 = TEMP1 * 16 (bytes of A per k-step)
1322         add             BO,     BO,     T2                                      // BO += TEMP2
1323         add             AO,     AO,     T1                                      // AO += TEMP1
1324 #endif
1325
1326 #if defined(LEFT)
1327         addi            KK,     KK,     2                               // KK += Number of values in A
1328 #endif
1329
1330
1331 DTRMM_L2x2_END:
1332
1333 DTRMM_L2x1_BEGIN:                                                       // Remainder tile: 1 value of A x 2 values of B per k-step; runs at most once
1334
1335         andi.           T1,     M,      1
1336         ble             DTRMM_L2x1_END                                  // andi. result is never negative: taken when bit 0 of M is clear
1337         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
1338 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1339         mr              BO,     B                                       // B -> BO
1340 #else
1341         mr              BO,     B                                       // B -> BO
1342         slwi            T1,     KK,     4                               // T1 = KK * 16: bytes in KK k-steps of B (2 values each)
1343         slwi            T2,     KK,     3                               // T2 = KK * 8: bytes in KK k-steps of A (1 value each)
1344         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1345         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1346 #endif
1347         // T1 = number of k-steps this tile has to process.
1348 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1349         sub             T1,     K,      KK                              // K - KK -> TEMP1
1350 #else
1351         mr              T1,     KK                                      // KK -> KTEMP
1352 #ifdef LEFT
1353         addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
1354 #else
1355         addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
1356 #endif
1357 #endif
1358
1359         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1360         mr              K1,     T1                                      // K1 = step count used for loop control
1361         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1362         ble             DTRMM_L2x1_SUB0                                 // L <= 0: fewer than 8 steps
1363         cmpwi           cr0,    L,      1
1364         ble             DTRMM_L2x1_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
1365         // L >= 2 from here. KERNEL2x1_*/LOAD2x1_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1366 DTRMM_L2x1_LOOP_START:
1367         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
1368         LOAD2x1_1
1369         KERNEL2x1_I1
1370         KERNEL2x1_2
1371         KERNEL2x1_1
1372         KERNEL2x1_2
1373
1374         KERNEL2x1_1
1375         KERNEL2x1_2
1376         KERNEL2x1_1
1377         KERNEL2x1_2
1378
1379         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1380         ble             DTRMM_L2x1_LOOP_END
1381
1382         .align 5
1383
1384 DTRMM_L2x1_LOOP:
1385         // Middle groups: 8 macro invocations per unit of L.
1386         KERNEL2x1_1
1387         KERNEL2x1_2
1388         KERNEL2x1_1
1389         KERNEL2x1_2
1390
1391         KERNEL2x1_1
1392         KERNEL2x1_2
1393         KERNEL2x1_1
1394         KERNEL2x1_2
1395
1396         addic.          L,      L,      -1
1397         bgt             DTRMM_L2x1_LOOP
1398
1399 DTRMM_L2x1_LOOP_END:
1400         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
1401         KERNEL2x1_1
1402         KERNEL2x1_2
1403         KERNEL2x1_1
1404         KERNEL2x1_2
1405
1406         KERNEL2x1_1
1407         KERNEL2x1_2
1408         KERNEL2x1_1
1409         KERNEL2x1_E2
1410
1411         b               DTRMM_L2x1_SUB1                                 // continue with the K1 & 7 remainder
1412
1413 DTRMM_L2x1_SUB4:
1414         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1415         KERNEL2x1_SUBI1
1416         KERNEL2x1_SUB1
1417         KERNEL2x1_SUB1
1418         KERNEL2x1_SUB1
1419
1420         KERNEL2x1_SUB1
1421         KERNEL2x1_SUB1
1422         KERNEL2x1_SUB1
1423         KERNEL2x1_SUB1
1424
1425         b               DTRMM_L2x1_SUB1
1426
1427 DTRMM_L2x1_SUB0:
1428         // K1 < 8: _SUBI1 does the first step, DTRMM_L2x1_SUB2 the remaining (K1 & 7) - 1.
1429         andi.           L,      K1,     7                                               // K1 & 7 -> L
1430         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1431         KERNEL2x1_SUBI1
1432
1433         addic.          L,      L,      -1
1434         ble             DTRMM_L2x1_SAVE
1435         b               DTRMM_L2x1_SUB2
1436
1437 DTRMM_L2x1_SUB1:
1438         // Remainder after the groups of 8.
1439         andi.           L,      K1,     7                                               // K1 & 7 -> L
1440         ble             DTRMM_L2x1_SAVE                                 // no remainder
1441
1442 DTRMM_L2x1_SUB2:
1443         // One _SUB1 invocation per remaining count in L.
1444         KERNEL2x1_SUB1
1445
1446         addic.          L,      L,      -1
1447         bgt             DTRMM_L2x1_SUB2
1448
1449 DTRMM_L2x1_SAVE:
1450         // SAVE2x1 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1451         SAVE2x1
1452         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1453 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1454         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1455         slwi            T2,     T1,     4                       // TEMP2 = TEMP1 * 16 (bytes of B per k-step)
1456         slwi            T1,     T1,     3                       // TEMP1 = TEMP1 * 8 (bytes of A per k-step)
1457         add             BO,     BO,     T2                                      // BO += TEMP2
1458         add             AO,     AO,     T1                                      // AO += TEMP1
1459 #endif
1460
1461 #if defined(LEFT)
1462         addi            KK,     KK,     1                               // KK += Number of values in A
1463 #endif
1464
1465
1466 DTRMM_L2x1_END:
1467
1468         slwi            T1,     K,      4                               // T1 = K * 16: bytes in K k-steps of B at 2 values per step
1469         add             B,      B,      T1                              // B now points past the two-value panel just processed
1470
1471 #if !defined(LEFT)
1472         addi            KK,     KK,     2                                       // KK += Number of values in B
1473 #endif
1474
1475
1476 DTRMM_L2_END:
1477 DTRMM_L1_BEGIN:                                                         // Final panel with one value of B per k-step; taken only when N is odd
1478
1479         andi.           T1,     N,      1
1480         ble             DTRMM_L1_END                                    // andi. result is never negative: taken when bit 0 of N is clear
1481         mr              CO,     C                                       // C -> CO
1482         mr              AO,     A                                       // A -> AO (restart A for this panel)
1483
1484 #if defined(LEFT)
1485         mr              KK,     OFFSET          // OFFSET -> KK
1486 #endif
1487
1488         srawi.          I,      M,      4                               // I = M / 16: number of 16-wide tiles
1489         ble             DTRMM_L1x16_END                                 // I <= 0: go straight to the remainder tiles
1490
1491 DTRMM_L1x16_BEGIN:                                                      // Full tile: 16 values of A x 1 value of B per k-step; repeated until counter I reaches zero
1492
1493         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value it had on entry to this label.
1494 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1495         mr              BO,     B                                       // B -> BO
1496 #else
1497         mr              BO,     B                                       // B -> BO
1498         slwi            T1,     KK,     3                               // T1 = KK * 8: bytes in KK k-steps of B (1 value each)
1499         slwi            T2,     KK,     7                               // T2 = KK * 128: bytes in KK k-steps of A (16 values each)
1500         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1501         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1502 #endif
1503         // T1 = number of k-steps this tile has to process.
1504 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1505         sub             T1,     K,      KK                              // K - KK -> TEMP1
1506 #else
1507         mr              T1,     KK                                      // KK -> KTEMP
1508 #ifdef LEFT
1509         addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
1510 #else
1511         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
1512 #endif
1513 #endif
1514
1515         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1516         mr              K1,     T1                                      // K1 = step count used for loop control
1517         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1518         ble             DTRMM_L1x16_SUB0                                // L <= 0: fewer than 8 steps
1519         cmpwi           cr0,    L,      1
1520         ble             DTRMM_L1x16_SUB4                                // L == 1: exactly one group of 8 (plus remainder)
1521         // L >= 2 from here. KERNEL1x16_*/LOAD1x16_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1522 DTRMM_L1x16_LOOP_START:
1523         // First group of 8. Each dcbt is a cache-touch hint for the address AO + PRE.
1524         dcbt            AO,     PRE
1525         LOAD1x16_1
1526         dcbt            AO,     PRE
1527         KERNEL1x16_I1
1528         dcbt            AO,     PRE
1529         KERNEL1x16_2
1530         dcbt            AO,     PRE
1531         KERNEL1x16_1
1532         dcbt            AO,     PRE
1533         KERNEL1x16_2
1534
1535         dcbt            AO,     PRE
1536         KERNEL1x16_1
1537         dcbt            AO,     PRE
1538         KERNEL1x16_2
1539         dcbt            AO,     PRE
1540         KERNEL1x16_1
1541         dcbt            AO,     PRE
1542         KERNEL1x16_2
1543
1544         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1545         ble             DTRMM_L1x16_LOOP_END
1546
1547         .align 5
1548
1549 DTRMM_L1x16_LOOP:
1550         // Middle groups: 8 macro invocations per unit of L, each preceded by a dcbt hint.
1551         dcbt            AO,     PRE
1552         KERNEL1x16_1
1553         dcbt            AO,     PRE
1554         KERNEL1x16_2
1555         dcbt            AO,     PRE
1556         KERNEL1x16_1
1557         dcbt            AO,     PRE
1558         KERNEL1x16_2
1559
1560         dcbt            AO,     PRE
1561         KERNEL1x16_1
1562         dcbt            AO,     PRE
1563         KERNEL1x16_2
1564         dcbt            AO,     PRE
1565         KERNEL1x16_1
1566         dcbt            AO,     PRE
1567         KERNEL1x16_2
1568
1569         addic.          L,      L,      -1
1570         bgt             DTRMM_L1x16_LOOP
1571
1572 DTRMM_L1x16_LOOP_END:
1573         // Last group of 8: closes with _E2 instead of _2, and no dcbt is issued before that final macro.
1574         dcbt            AO,     PRE
1575         KERNEL1x16_1
1576         dcbt            AO,     PRE
1577         KERNEL1x16_2
1578         dcbt            AO,     PRE
1579         KERNEL1x16_1
1580         dcbt            AO,     PRE
1581         KERNEL1x16_2
1582
1583         dcbt            AO,     PRE
1584         KERNEL1x16_1
1585         dcbt            AO,     PRE
1586         KERNEL1x16_2
1587         dcbt            AO,     PRE
1588         KERNEL1x16_1
1589         KERNEL1x16_E2
1590
1591         b               DTRMM_L1x16_SUB1                                // continue with the K1 & 7 remainder
1592
1593 DTRMM_L1x16_SUB4:
1594         // L == 1: one group of 8 done with the _SUB macros; only the first four are preceded by a dcbt hint.
1595         dcbt            AO,     PRE
1596         KERNEL1x16_SUBI1
1597         dcbt            AO,     PRE
1598         KERNEL1x16_SUB1
1599         dcbt            AO,     PRE
1600         KERNEL1x16_SUB1
1601         dcbt            AO,     PRE
1602         KERNEL1x16_SUB1
1603
1604         KERNEL1x16_SUB1
1605         KERNEL1x16_SUB1
1606         KERNEL1x16_SUB1
1607         KERNEL1x16_SUB1
1608
1609         b               DTRMM_L1x16_SUB1
1610
1611 DTRMM_L1x16_SUB0:
1612         // K1 < 8: _SUBI1 does the first step, DTRMM_L1x16_SUB2 the remaining (K1 & 7) - 1.
1613         andi.           L,      K1,     7                                               // K1 & 7 -> L
1614         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1615         KERNEL1x16_SUBI1
1616
1617         addic.          L,      L,      -1
1618         ble             DTRMM_L1x16_SAVE
1619         b               DTRMM_L1x16_SUB2
1620
1621 DTRMM_L1x16_SUB1:
1622         // Remainder after the groups of 8.
1623         andi.           L,      K1,     7                                               // K1 & 7 -> L
1624         ble             DTRMM_L1x16_SAVE                                // no remainder
1625
1626 DTRMM_L1x16_SUB2:
1627         // One _SUB1 invocation per remaining count in L.
1628         KERNEL1x16_SUB1
1629
1630         addic.          L,      L,      -1
1631         bgt             DTRMM_L1x16_SUB2
1632
1633 DTRMM_L1x16_SAVE:
1634         // SAVE1x16 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1635         SAVE1x16
1636         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1637 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1638         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1639         slwi            T2,     T1,     3                       // TEMP2 = TEMP1 * 8 (bytes of B per k-step)
1640         slwi            T1,     T1,     7                       // TEMP1 = TEMP1 * 128 (bytes of A per k-step)
1641         add             BO,     BO,     T2                                      // BO += TEMP2
1642         add             AO,     AO,     T1                                      // AO += TEMP1
1643 #endif
1644
1645 #if defined(LEFT)
1646         addi            KK,     KK,     16                              // KK += Number of values in A
1647 #endif
1648
1649
1650         addic.          I,      I,      -1                              // one 16-wide tile finished
1651         bgt             DTRMM_L1x16_BEGIN                               // more 16-wide tiles to do
1652
1653 DTRMM_L1x16_END:
1654
1655 DTRMM_L1x8_BEGIN:                                                       // Remainder tile: 8 values of A x 1 value of B per k-step; runs at most once
1656         andi.           T2,     M,      15                              // T2 = M & 15
1657         ble             DTRMM_L1x1_END                                  // andi. never yields a negative value, so taken only when M & 15 == 0: no remainder tiles
1658
1659         andi.           T1,     M,      8
1660         ble             DTRMM_L1x8_END                                  // bit 3 of M clear -> no 8-wide tile
1661         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
1662 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1663         mr              BO,     B                                       // B -> BO
1664 #else
1665         mr              BO,     B                                       // B -> BO
1666         slwi            T1,     KK,     3                               // T1 = KK * 8: bytes in KK k-steps of B (1 value each)
1667         slwi            T2,     KK,     6                               // T2 = KK * 64: bytes in KK k-steps of A (8 values each)
1668         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1669         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1670 #endif
1671         // T1 = number of k-steps this tile has to process.
1672 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1673         sub             T1,     K,      KK                              // K - KK -> TEMP1
1674 #else
1675         mr              T1,     KK                                      // KK -> KTEMP
1676 #ifdef LEFT
1677         addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
1678 #else
1679         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
1680 #endif
1681 #endif
1682
1683         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1684         mr              K1,     T1                                      // K1 = step count used for loop control
1685         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1686         ble             DTRMM_L1x8_SUB0                                 // L <= 0: fewer than 8 steps
1687         cmpwi           cr0,    L,      1
1688         ble             DTRMM_L1x8_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
1689         // L >= 2 from here. KERNEL1x8_*/LOAD1x8_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1690 DTRMM_L1x8_LOOP_START:
1691         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
1692         LOAD1x8_1
1693         KERNEL1x8_I1
1694         KERNEL1x8_2
1695         KERNEL1x8_1
1696         KERNEL1x8_2
1697
1698         KERNEL1x8_1
1699         KERNEL1x8_2
1700         KERNEL1x8_1
1701         KERNEL1x8_2
1702
1703         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1704         ble             DTRMM_L1x8_LOOP_END
1705
1706         .align 5
1707
1708 DTRMM_L1x8_LOOP:
1709         // Middle groups: 8 macro invocations per unit of L.
1710         KERNEL1x8_1
1711         KERNEL1x8_2
1712         KERNEL1x8_1
1713         KERNEL1x8_2
1714
1715         KERNEL1x8_1
1716         KERNEL1x8_2
1717         KERNEL1x8_1
1718         KERNEL1x8_2
1719
1720         addic.          L,      L,      -1
1721         bgt             DTRMM_L1x8_LOOP
1722
1723 DTRMM_L1x8_LOOP_END:
1724         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
1725         KERNEL1x8_1
1726         KERNEL1x8_2
1727         KERNEL1x8_1
1728         KERNEL1x8_2
1729
1730         KERNEL1x8_1
1731         KERNEL1x8_2
1732         KERNEL1x8_1
1733         KERNEL1x8_E2
1734
1735         b               DTRMM_L1x8_SUB1                                 // continue with the K1 & 7 remainder
1736
1737 DTRMM_L1x8_SUB4:
1738         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1739         KERNEL1x8_SUBI1
1740         KERNEL1x8_SUB1
1741         KERNEL1x8_SUB1
1742         KERNEL1x8_SUB1
1743
1744         KERNEL1x8_SUB1
1745         KERNEL1x8_SUB1
1746         KERNEL1x8_SUB1
1747         KERNEL1x8_SUB1
1748
1749         b               DTRMM_L1x8_SUB1
1750
1751 DTRMM_L1x8_SUB0:
1752         // K1 < 8: _SUBI1 does the first step, DTRMM_L1x8_SUB2 the remaining (K1 & 7) - 1.
1753         andi.           L,      K1,     7                                               // K1 & 7 -> L
1754         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1755         KERNEL1x8_SUBI1
1756
1757         addic.          L,      L,      -1
1758         ble             DTRMM_L1x8_SAVE
1759         b               DTRMM_L1x8_SUB2
1760
1761 DTRMM_L1x8_SUB1:
1762         // Remainder after the groups of 8.
1763         andi.           L,      K1,     7                                               // K1 & 7 -> L
1764         ble             DTRMM_L1x8_SAVE                                 // no remainder
1765
1766 DTRMM_L1x8_SUB2:
1767         // One _SUB1 invocation per remaining count in L.
1768         KERNEL1x8_SUB1
1769
1770         addic.          L,      L,      -1
1771         bgt             DTRMM_L1x8_SUB2
1772
1773 DTRMM_L1x8_SAVE:
1774         // SAVE1x8 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1775         SAVE1x8
1776         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1777 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1778         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1779         slwi            T2,     T1,     3                       // TEMP2 = TEMP1 * 8 (bytes of B per k-step)
1780         slwi            T1,     T1,     6                       // TEMP1 = TEMP1 * 64 (bytes of A per k-step)
1781         add             BO,     BO,     T2                                      // BO += TEMP2
1782         add             AO,     AO,     T1                                      // AO += TEMP1
1783 #endif
1784
1785 #if defined(LEFT)
1786         addi            KK,     KK,     8                               // KK += Number of values in A
1787 #endif
1788
1789
1790 DTRMM_L1x8_END:
1791
1792 DTRMM_L1x4_BEGIN:                                                       // Remainder tile: 4 values of A x 1 value of B per k-step; runs at most once
1793
1794         andi.           T1,     M,      4
1795         ble             DTRMM_L1x4_END                                  // andi. result is never negative: taken when bit 2 of M is clear
1796         // Select the first k-step this tile reads. AO is not reloaded here; it keeps the value left by the code above.
1797 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1798         mr              BO,     B                                       // B -> BO
1799 #else
1800         mr              BO,     B                                       // B -> BO
1801         slwi            T1,     KK,     3                               // T1 = KK * 8: bytes in KK k-steps of B (1 value each)
1802         slwi            T2,     KK,     5                               // T2 = KK * 32: bytes in KK k-steps of A (4 values each)
1803         add             BO,     BO,     T1                              // BO skips the first KK k-steps
1804         add             AO,     AO,     T2                              // AO skips the first KK k-steps
1805 #endif
1806         // T1 = number of k-steps this tile has to process.
1807 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1808         sub             T1,     K,      KK                              // K - KK -> TEMP1
1809 #else
1810         mr              T1,     KK                                      // KK -> KTEMP
1811 #ifdef LEFT
1812         addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
1813 #else
1814         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
1815 #endif
1816 #endif
1817
1818         mr              KKK,    T1                                      // KKK keeps the step count for the pointer fix-up after SAVE
1819         mr              K1,     T1                                      // K1 = step count used for loop control
1820         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1821         ble             DTRMM_L1x4_SUB0                                 // L <= 0: fewer than 8 steps
1822         cmpwi           cr0,    L,      1
1823         ble             DTRMM_L1x4_SUB4                                 // L == 1: exactly one group of 8 (plus remainder)
1824         // L >= 2 from here. KERNEL1x4_*/LOAD1x4_* are defined outside this chunk; presumably one k-step per KERNEL macro -- confirm there.
1825 DTRMM_L1x4_LOOP_START:
1826         // First group of 8: LOAD + _I1 open the sequence, then _2/_1 alternate.
1827         LOAD1x4_1
1828         KERNEL1x4_I1
1829         KERNEL1x4_2
1830         KERNEL1x4_1
1831         KERNEL1x4_2
1832
1833         KERNEL1x4_1
1834         KERNEL1x4_2
1835         KERNEL1x4_1
1836         KERNEL1x4_2
1837
1838         addic.          L,      L,      -2                              // first and last groups are unrolled separately; L = middle groups left
1839         ble             DTRMM_L1x4_LOOP_END
1840
1841         .align 5
1842
1843 DTRMM_L1x4_LOOP:
1844         // Middle groups: 8 macro invocations per unit of L.
1845         KERNEL1x4_1
1846         KERNEL1x4_2
1847         KERNEL1x4_1
1848         KERNEL1x4_2
1849
1850         KERNEL1x4_1
1851         KERNEL1x4_2
1852         KERNEL1x4_1
1853         KERNEL1x4_2
1854
1855         addic.          L,      L,      -1
1856         bgt             DTRMM_L1x4_LOOP
1857
1858 DTRMM_L1x4_LOOP_END:
1859         // Last group of 8: identical sequence except that it closes with _E2 instead of _2.
1860         KERNEL1x4_1
1861         KERNEL1x4_2
1862         KERNEL1x4_1
1863         KERNEL1x4_2
1864
1865         KERNEL1x4_1
1866         KERNEL1x4_2
1867         KERNEL1x4_1
1868         KERNEL1x4_E2
1869
1870         b               DTRMM_L1x4_SUB1                                 // continue with the K1 & 7 remainder
1871
1872 DTRMM_L1x4_SUB4:
1873         // L == 1: one group of 8 done with the _SUB macros (_SUBI1 first, then seven _SUB1).
1874         KERNEL1x4_SUBI1
1875         KERNEL1x4_SUB1
1876         KERNEL1x4_SUB1
1877         KERNEL1x4_SUB1
1878
1879         KERNEL1x4_SUB1
1880         KERNEL1x4_SUB1
1881         KERNEL1x4_SUB1
1882         KERNEL1x4_SUB1
1883
1884         b               DTRMM_L1x4_SUB1
1885
1886 DTRMM_L1x4_SUB0:
1887         // K1 < 8: _SUBI1 does the first step, DTRMM_L1x4_SUB2 the remaining (K1 & 7) - 1.
1888         andi.           L,      K1,     7                                               // K1 & 7 -> L
1889         // NOTE(review): _SUBI1 below runs before L is tested, so one step executes even if K1 <= 0 -- confirm K1 >= 1 always holds.
1890         KERNEL1x4_SUBI1
1891
1892         addic.          L,      L,      -1
1893         ble             DTRMM_L1x4_SAVE
1894         b               DTRMM_L1x4_SUB2
1895
1896 DTRMM_L1x4_SUB1:
1897         // Remainder after the groups of 8.
1898         andi.           L,      K1,     7                                               // K1 & 7 -> L
1899         ble             DTRMM_L1x4_SAVE                                 // no remainder
1900
1901 DTRMM_L1x4_SUB2:
1902         // One _SUB1 invocation per remaining count in L.
1903         KERNEL1x4_SUB1
1904
1905         addic.          L,      L,      -1
1906         bgt             DTRMM_L1x4_SUB2
1907
1908 DTRMM_L1x4_SAVE:
1909         // SAVE1x4 is defined outside this chunk; presumably writes the tile out through CO -- confirm there.
1910         SAVE1x4
1911         // In this configuration the tile consumed only KKK of the K steps: move BO/AO past the K - KKK left over.
1912 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1913         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
1914         slwi            T2,     T1,     3                       // TEMP2 = TEMP1 * 8 (bytes of B per k-step)
1915         slwi            T1,     T1,     5                       // TEMP1 = TEMP1 * 32 (bytes of A per k-step)
1916         add             BO,     BO,     T2                                      // BO += TEMP2
1917         add             AO,     AO,     T1                                      // AO += TEMP1
1918 #endif
1919
1920 #if defined(LEFT)
1921         addi            KK,     KK,     4                               // KK += Number of values in A
1922 #endif
1923
1924
1925 DTRMM_L1x4_END:
1926
// ---------------------------------------------------------------------------
// 1x2 tile: executed only when bit 1 of M is set (M & 2 != 0).
//
// Steps: position AO/BO, derive the k trip count into T1 (copied to KKK and
// K1), run the k-loop in groups of 8 plus a K1 & 7 remainder, write back with
// SAVE1x2, then update AO/BO/KK.
// LOAD1x2_*, KERNEL1x2_* and SAVE1x2 are macros defined outside this chunk;
// presumably every KERNEL1x2_* invocation consumes one k-step -- TODO confirm.
// ---------------------------------------------------------------------------
1927 DTRMM_L1x2_BEGIN:
1928
1929         andi.           T1,     M,      2                               // T1 = M & 2
1930         ble             DTRMM_L1x2_END                                  // andi. result is never negative, so taken only when the bit is clear
1931
// Starting position in the A and B data for this tile.
1932 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1933         mr              BO,     B                                       // B -> BO
1934 #else
1935         mr              BO,     B                                       // B -> BO
1936         slwi            T1,     KK,     3                               // T1 = KK << 3: byte offset in B (1 value per k-step, 8 bytes each)
1937         slwi            T2,     KK,     4                               // T2 = KK << 4: byte offset in A (2 values per k-step, 8 bytes each)
1938         add             BO,     BO,     T1                              // Add values to BO
1939         add             AO,     AO,     T2                              // Add values to AO
1940 #endif
1941
// Trip count for the k-loop: either K - KK, or KK plus the tile extent
// along the triangular operand (2 for LEFT, 1 otherwise).
1942 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1943         sub             T1,     K,      KK                              // K - KK -> TEMP1
1944 #else
1945         mr              T1,     KK                                      // KK -> KTEMP
1946 #ifdef LEFT
1947         addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
1948 #else
1949         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
1950 #endif
1951 #endif
1952
1953         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
1954         mr              K1,     T1                                      // K1 = trip count used by the loop logic below
1955         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
1956         ble             DTRMM_L1x2_SUB0                                 // fewer than 8 k-steps in total
1957         cmpwi           cr0,    L,      1
1958         ble             DTRMM_L1x2_SUB4                                 // L == 1: exactly one group of 8
1959
// L >= 2. Prologue group: initial load plus the first 8 kernel invocations.
1960 DTRMM_L1x2_LOOP_START:
1961
1962         LOAD1x2_1
1963         KERNEL1x2_I1
1964         KERNEL1x2_2
1965         KERNEL1x2_1
1966         KERNEL1x2_2
1967
1968         KERNEL1x2_1
1969         KERNEL1x2_2
1970         KERNEL1x2_1
1971         KERNEL1x2_2
1972
1973         addic.          L,      L,      -2                              // L -= 2: accounts for this group and the LOOP_END group
1974         ble             DTRMM_L1x2_LOOP_END                             // L was exactly 2 -> no steady-state passes
1975
1976         .align 5                                                        // GNU as on PowerPC: align loop head to 2^5 = 32 bytes
1977
// Steady state: 8 kernel invocations per pass, one pass per remaining unit of L.
1978 DTRMM_L1x2_LOOP:
1979
1980         KERNEL1x2_1
1981         KERNEL1x2_2
1982         KERNEL1x2_1
1983         KERNEL1x2_2
1984
1985         KERNEL1x2_1
1986         KERNEL1x2_2
1987         KERNEL1x2_1
1988         KERNEL1x2_2
1989
1990         addic.          L,      L,      -1                              // L -= 1, sets CR0
1991         bgt             DTRMM_L1x2_LOOP                                 // repeat while L > 0
1992
// Epilogue group: last 8 invocations; the final one is the _E2 variant
// (presumably the pipeline-draining form that issues no further loads -- confirm).
1993 DTRMM_L1x2_LOOP_END:
1994
1995         KERNEL1x2_1
1996         KERNEL1x2_2
1997         KERNEL1x2_1
1998         KERNEL1x2_2
1999
2000         KERNEL1x2_1
2001         KERNEL1x2_2
2002         KERNEL1x2_1
2003         KERNEL1x2_E2
2004
2005         b               DTRMM_L1x2_SUB1                                 // go handle the K1 & 7 remainder
2006
// L == 1: exactly one group of 8, done with the SUB macros (SUBI1 first, then 7x SUB1).
2007 DTRMM_L1x2_SUB4:
2008
2009         KERNEL1x2_SUBI1
2010         KERNEL1x2_SUB1
2011         KERNEL1x2_SUB1
2012         KERNEL1x2_SUB1
2013
2014         KERNEL1x2_SUB1
2015         KERNEL1x2_SUB1
2016         KERNEL1x2_SUB1
2017         KERNEL1x2_SUB1
2018
2019         b               DTRMM_L1x2_SUB1                                 // go handle the K1 & 7 remainder
2020
// K1 >> 3 <= 0: no group of 8 was run, so the first remainder step uses SUBI1.
// NOTE(review): SUBI1 executes before L is tested, i.e. one step runs even if
// K1 & 7 == 0 -- presumably K1 >= 1 is guaranteed by the caller; confirm.
2021 DTRMM_L1x2_SUB0:
2022
2023         andi.           L,      K1,     7                                               // K1 & 7 -> L
2024
2025         KERNEL1x2_SUBI1
2026
2027         addic.          L,      L,      -1                              // one remainder step already done
2028         ble             DTRMM_L1x2_SAVE                                 // nothing left -> write back
2029         b               DTRMM_L1x2_SUB2                                 // otherwise run the remaining steps
2030
// Remainder after at least one group of 8 has been executed.
2031 DTRMM_L1x2_SUB1:
2032
2033         andi.           L,      K1,     7                                               // K1 & 7 -> L
2034         ble             DTRMM_L1x2_SAVE                                 // taken only when L == 0
2035
// One SUB1 invocation per pass until L reaches 0.
2036 DTRMM_L1x2_SUB2:
2037
2038         KERNEL1x2_SUB1
2039
2040         addic.          L,      L,      -1                              // L -= 1, sets CR0
2041         bgt             DTRMM_L1x2_SUB2                                 // repeat while L > 0
2042
// Write-back via SAVE1x2 (macro defined elsewhere), then pointer / KK bookkeeping.
2043 DTRMM_L1x2_SAVE:
2044
2045         SAVE1x2
2046
// In these two configurations AO and BO are advanced by the K - KKK k-steps
// that this tile's loop did not execute.
2047 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2048         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
2049         slwi            T2,     T1,     3                       // TEMP2 = TEMP1 << 3: byte offset in B (1 value per k-step, 8 bytes each)
2050         slwi            T1,     T1,     4                       // TEMP1 = TEMP1 << 4: byte offset in A (2 values per k-step, 8 bytes each)
2051         add             BO,     BO,     T2                                      // BO += TEMP2 (already a byte offset)
2052         add             AO,     AO,     T1                                      // AO += TEMP1 (already a byte offset)
2053 #endif
2054
2055 #if defined(LEFT)
2056         addi            KK,     KK,     2                               // KK += Number of values in A
2057 #endif
2058
2059
2060 DTRMM_L1x2_END:
2061
// ---------------------------------------------------------------------------
// 1x1 tile: executed only when bit 0 of M is set (M & 1 != 0).
//
// Steps: position AO/BO, derive the k trip count into T1 (copied to KKK and
// K1), run the k-loop in groups of 8 plus a K1 & 7 remainder, write back with
// SAVE1x1, then update AO/BO/KK.
// LOAD1x1_*, KERNEL1x1_* and SAVE1x1 are macros defined outside this chunk;
// presumably every KERNEL1x1_* invocation consumes one k-step -- TODO confirm.
// ---------------------------------------------------------------------------
2062 DTRMM_L1x1_BEGIN:
2063
2064         andi.           T1,     M,      1                               // T1 = M & 1
2065         ble             DTRMM_L1x1_END                                  // andi. result is never negative, so taken only when the bit is clear
2066
// Starting position in the A and B data for this tile.
2067 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2068         mr              BO,     B                                       // B -> BO
2069 #else
2070         mr              BO,     B                                       // B -> BO
2071         slwi            T1,     KK,     3                               // T1 = KK << 3: byte offset in B (1 value per k-step, 8 bytes each)
2072         slwi            T2,     KK,     3                               // T2 = KK << 3: byte offset in A (1 value per k-step, 8 bytes each)
2073         add             BO,     BO,     T1                              // Add values to BO
2074         add             AO,     AO,     T2                              // Add values to AO
2075 #endif
2076
// Trip count for the k-loop: either K - KK, or KK + 1 (the tile is 1 wide in
// both A and B, so both inner branches add the same constant).
2077 #if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2078         sub             T1,     K,      KK                              // K - KK -> TEMP1
2079 #else
2080         mr              T1,     KK                                      // KK -> KTEMP
2081 #ifdef LEFT
2082         addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
2083 #else
2084         addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
2085 #endif
2086 #endif
2087
2088         mr              KKK,    T1                                      // KKK keeps the trip count for the pointer fix-up after SAVE
2089         mr              K1,     T1                                      // K1 = trip count used by the loop logic below
2090         srawi.          L,      K1,     3                               // KTEMP / 8 -> L
2091         ble             DTRMM_L1x1_SUB0                                 // fewer than 8 k-steps in total
2092         cmpwi           cr0,    L,      1
2093         ble             DTRMM_L1x1_SUB4                                 // L == 1: exactly one group of 8
2094
// L >= 2. Prologue group: initial load plus the first 8 kernel invocations.
2095 DTRMM_L1x1_LOOP_START:
2096
2097         LOAD1x1_1
2098         KERNEL1x1_I1
2099         KERNEL1x1_2
2100         KERNEL1x1_1
2101         KERNEL1x1_2
2102
2103         KERNEL1x1_1
2104         KERNEL1x1_2
2105         KERNEL1x1_1
2106         KERNEL1x1_2
2107
2108         addic.          L,      L,      -2                              // L -= 2: accounts for this group and the LOOP_END group
2109         ble             DTRMM_L1x1_LOOP_END                             // L was exactly 2 -> no steady-state passes
2110
2111         .align 5                                                        // GNU as on PowerPC: align loop head to 2^5 = 32 bytes
2112
// Steady state: 8 kernel invocations per pass, one pass per remaining unit of L.
2113 DTRMM_L1x1_LOOP:
2114
2115         KERNEL1x1_1
2116         KERNEL1x1_2
2117         KERNEL1x1_1
2118         KERNEL1x1_2
2119
2120         KERNEL1x1_1
2121         KERNEL1x1_2
2122         KERNEL1x1_1
2123         KERNEL1x1_2
2124
2125         addic.          L,      L,      -1                              // L -= 1, sets CR0
2126         bgt             DTRMM_L1x1_LOOP                                 // repeat while L > 0
2127
// Epilogue group: last 8 invocations; the final one is the _E2 variant
// (presumably the pipeline-draining form that issues no further loads -- confirm).
2128 DTRMM_L1x1_LOOP_END:
2129
2130         KERNEL1x1_1
2131         KERNEL1x1_2
2132         KERNEL1x1_1
2133         KERNEL1x1_2
2134
2135         KERNEL1x1_1
2136         KERNEL1x1_2
2137         KERNEL1x1_1
2138         KERNEL1x1_E2
2139
2140         b               DTRMM_L1x1_SUB1                                 // go handle the K1 & 7 remainder
2141
// L == 1: exactly one group of 8, done with the SUB macros (SUBI1 first, then 7x SUB1).
2142 DTRMM_L1x1_SUB4:
2143
2144         KERNEL1x1_SUBI1
2145         KERNEL1x1_SUB1
2146         KERNEL1x1_SUB1
2147         KERNEL1x1_SUB1
2148
2149         KERNEL1x1_SUB1
2150         KERNEL1x1_SUB1
2151         KERNEL1x1_SUB1
2152         KERNEL1x1_SUB1
2153
2154         b               DTRMM_L1x1_SUB1                                 // go handle the K1 & 7 remainder
2155
// K1 >> 3 <= 0: no group of 8 was run, so the first remainder step uses SUBI1.
// NOTE(review): SUBI1 executes before L is tested, i.e. one step runs even if
// K1 & 7 == 0 -- presumably K1 >= 1 is guaranteed by the caller; confirm.
2156 DTRMM_L1x1_SUB0:
2157
2158         andi.           L,      K1,     7                                               // K1 & 7 -> L
2159
2160         KERNEL1x1_SUBI1
2161
2162         addic.          L,      L,      -1                              // one remainder step already done
2163         ble             DTRMM_L1x1_SAVE                                 // nothing left -> write back
2164         b               DTRMM_L1x1_SUB2                                 // otherwise run the remaining steps
2165
// Remainder after at least one group of 8 has been executed.
2166 DTRMM_L1x1_SUB1:
2167
2168         andi.           L,      K1,     7                                               // K1 & 7 -> L
2169         ble             DTRMM_L1x1_SAVE                                 // taken only when L == 0
2170
// One SUB1 invocation per pass until L reaches 0.
2171 DTRMM_L1x1_SUB2:
2172
2173         KERNEL1x1_SUB1
2174
2175         addic.          L,      L,      -1                              // L -= 1, sets CR0
2176         bgt             DTRMM_L1x1_SUB2                                 // repeat while L > 0
2177
// Write-back via SAVE1x1 (macro defined elsewhere), then pointer / KK bookkeeping.
2178 DTRMM_L1x1_SAVE:
2179
2180         SAVE1x1
2181
// In these two configurations AO and BO are advanced by the K - KKK k-steps
// that this tile's loop did not execute.
2182 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2183         sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
2184         slwi            T2,     T1,     3                       // TEMP2 = TEMP1 << 3: byte offset in B (1 value per k-step, 8 bytes each)
2185         slwi            T1,     T1,     3                       // TEMP1 = TEMP1 << 3: byte offset in A (1 value per k-step, 8 bytes each)
2186         add             BO,     BO,     T2                                      // BO += TEMP2 (already a byte offset)
2187         add             AO,     AO,     T1                                      // AO += TEMP1 (already a byte offset)
2188 #endif
2189
2190 #if defined(LEFT)
2191         addi            KK,     KK,     1                               // KK += Number of values in A
2192 #endif
2193
2194
2195 DTRMM_L1x1_END:
2196
// When LEFT is not defined, KK is advanced here, after the M-remainder tiles,
// by 1 (the number of B values per k-step in this single-column section),
// rather than after each tile as in the LEFT case.
2197 #if !defined(LEFT)
2198         addi            KK,     KK,     1                                       // KK += Number of values in B
2199 #endif
2200
2201
2202 DTRMM_L1_END: