1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * dsolve_8x4_rt_msa - solve an 8x4 tile for the right-side, transposed
 * ("RT") DTRSM kernel using MIPS MSA vector intrinsics.
 *
 * a   : packed 8 x bk panel of A; the solved 8x4 block is also written
 *       back into the packed panel (see trailing ST_DP4 stores).
 * b   : packed bk x 4 panel of B; entries b[0], b[4..5], b[8..10] and
 *       b[12..15] of its tail hold the triangular diagonal block used
 *       for the backward substitution.
 * c   : top-left of the 8x4 destination tile in C, column stride ldc.
 * bk  : number of rank-1 updates to subtract before the solve.
 *
 * NOTE(review): several source lines are elided in this view (braces
 * and some statements, e.g. multiplies by the diagonal entries
 * b15/b10/b5/b0); the comments describe only the visible statements.
 */
31 static __attribute__ ((noinline))
32 void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
34 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
35 v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
36 v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
37 v2f64 src_b14, src_b15;
/* Pointers to columns 1..3 of the C tile. */
38 FLOAT *c_nxt1line = c + ldc;
39 FLOAT *c_nxt2line = c + 2 * ldc;
40 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 8x4 C tile: four v2f64 vectors (2 doubles each) per column. */
42 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
43 LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
44 LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
45 LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
49 BLASLONG i, pref_offset;
50 FLOAT *pba = a, *pbb = b, *pa0_pref;
51 v2f64 src_b, src_b0, src_b1, src_a0, src_a1, src_a2, src_a3;
/* Align the prefetch pointer to the next L1 cache-line boundary
   (converted from bytes to FLOAT elements). */
53 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
57 pref_offset = L1_DATA_LINESIZE - pref_offset;
58 pref_offset = pref_offset / sizeof(FLOAT);
61 pa0_pref = a + pref_offset;
/* GEMM update, unrolled by 2 over k: C -= A(:,k) * B(k,:). */
63 for (i = (bk >> 1); i--;)
/* Prefetch upcoming A data ahead of the loads. */
65 PREF_OFFSET(pa0_pref, 128);
66 PREF_OFFSET(pa0_pref, 160);
67 PREF_OFFSET(pa0_pref, 192);
68 PREF_OFFSET(pa0_pref, 224);
/* First k-step: load 8 doubles of A and 4 of B. */
70 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
71 LD_DP2_INC(pbb, 2, src_b0, src_b1);
/* Broadcast the low element of src_b0 and update column 0. */
73 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
74 src_c0 -= src_a0 * src_b;
75 src_c1 -= src_a1 * src_b;
76 src_c2 -= src_a2 * src_b;
77 src_c3 -= src_a3 * src_b;
/* Broadcast the high element of src_b0 and update column 1. */
79 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
80 src_c4 -= src_a0 * src_b;
81 src_c5 -= src_a1 * src_b;
82 src_c6 -= src_a2 * src_b;
83 src_c7 -= src_a3 * src_b;
/* Columns 2 and 3 from src_b1. */
85 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
86 src_c8 -= src_a0 * src_b;
87 src_c9 -= src_a1 * src_b;
88 src_c10 -= src_a2 * src_b;
89 src_c11 -= src_a3 * src_b;
91 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
92 src_c12 -= src_a0 * src_b;
93 src_c13 -= src_a1 * src_b;
94 src_c14 -= src_a2 * src_b;
95 src_c15 -= src_a3 * src_b;
/* Second k-step of the unrolled pair (identical structure). */
97 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
98 LD_DP2_INC(pbb, 2, src_b0, src_b1);
100 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
101 src_c0 -= src_a0 * src_b;
102 src_c1 -= src_a1 * src_b;
103 src_c2 -= src_a2 * src_b;
104 src_c3 -= src_a3 * src_b;
106 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
107 src_c4 -= src_a0 * src_b;
108 src_c5 -= src_a1 * src_b;
109 src_c6 -= src_a2 * src_b;
110 src_c7 -= src_a3 * src_b;
112 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
113 src_c8 -= src_a0 * src_b;
114 src_c9 -= src_a1 * src_b;
115 src_c10 -= src_a2 * src_b;
116 src_c11 -= src_a3 * src_b;
118 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
119 src_c12 -= src_a0 * src_b;
120 src_c13 -= src_a1 * src_b;
121 src_c14 -= src_a2 * src_b;
122 src_c15 -= src_a3 * src_b;
/* Tail k-step — presumably taken when bk is odd (the guarding
   condition is not visible in this view; confirm against full file). */
129 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
130 LD_DP2_INC(pbb, 2, src_b0, src_b1);
132 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
133 src_c0 -= src_a0 * src_b;
134 src_c1 -= src_a1 * src_b;
135 src_c2 -= src_a2 * src_b;
136 src_c3 -= src_a3 * src_b;
138 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
139 src_c4 -= src_a0 * src_b;
140 src_c5 -= src_a1 * src_b;
141 src_c6 -= src_a2 * src_b;
142 src_c7 -= src_a3 * src_b;
144 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
145 src_c8 -= src_a0 * src_b;
146 src_c9 -= src_a1 * src_b;
147 src_c10 -= src_a2 * src_b;
148 src_c11 -= src_a3 * src_b;
150 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
151 src_c12 -= src_a0 * src_b;
152 src_c13 -= src_a1 * src_b;
153 src_c14 -= src_a2 * src_b;
154 src_c15 -= src_a3 * src_b;
/* Splat the entries of the triangular 4x4 block of B into vectors:
   pairs are loaded with LD_DP and split via splati; lone entries are
   loaded scalar then splatted. */
161 src_b12 = LD_DP(b + 12);
162 src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
163 src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
164 src_b14 = LD_DP(b + 14);
165 src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
166 src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
168 src_b8 = LD_DP(b + 8);
169 src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
170 src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
171 src_b10 = __msa_cast_to_vector_double(*(b + 10));
172 src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
174 src_b0 = __msa_cast_to_vector_double(*(b + 0));
175 src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
176 src_b4 = LD_DP(b + 4);
177 src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
178 src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
/* Backward substitution across the four columns (column 3 eliminated
   first). Scaling by the (pre-inverted) diagonal entries b15/b10/b5/b0
   is not visible in this view — confirm against the full file. */
185 src_c8 -= src_c12 * src_b14;
186 src_c9 -= src_c13 * src_b14;
187 src_c10 -= src_c14 * src_b14;
188 src_c11 -= src_c15 * src_b14;
195 src_c4 -= src_c12 * src_b13;
196 src_c5 -= src_c13 * src_b13;
197 src_c6 -= src_c14 * src_b13;
198 src_c7 -= src_c15 * src_b13;
200 src_c4 -= src_c8 * src_b9;
201 src_c5 -= src_c9 * src_b9;
202 src_c6 -= src_c10 * src_b9;
203 src_c7 -= src_c11 * src_b9;
210 src_c0 -= src_c12 * src_b12;
211 src_c1 -= src_c13 * src_b12;
212 src_c2 -= src_c14 * src_b12;
213 src_c3 -= src_c15 * src_b12;
215 src_c0 -= src_c8 * src_b8;
216 src_c1 -= src_c9 * src_b8;
217 src_c2 -= src_c10 * src_b8;
218 src_c3 -= src_c11 * src_b8;
220 src_c0 -= src_c4 * src_b4;
221 src_c1 -= src_c5 * src_b4;
222 src_c2 -= src_c6 * src_b4;
223 src_c3 -= src_c7 * src_b4;
/* Store the solved tile back to C and into the packed A panel
   (column 3 first, matching the solve order). */
230 ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2);
231 ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
232 ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
233 ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
234 ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
235 ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
236 ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
237 ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
/*
 * dsolve_8x2_rt_msa - solve an 8x2 tile for the RT DTRSM kernel.
 * Same contract as dsolve_8x4_rt_msa but for two columns; the 2x2
 * triangular block lives in b[0], b[2..3].
 * NOTE(review): lines are elided in this view (loop braces, the B
 * loads inside the loop, declaration of `i`, and the diagonal-scaling
 * statements); comments cover only what is visible.
 */
240 static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
242 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
243 v2f64 src_b0, src_b2, src_b3;
/* Load the two C columns, four v2f64 vectors each. */
245 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
246 LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
251 FLOAT *pba = a, *pbb = b;
252 v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3;
253 v2f64 src_a4, src_a5, src_a6, src_a7;
/* Preload the first A column; the loop below software-pipelines by
   loading the next column into src_a4..7 while updating with a0..3. */
255 LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
/* GEMM update over bk-1 steps (the final step is peeled below). */
258 for (i = bk - 1; i--;)
263 LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
/* Broadcast b[k][0] / b[k][1] and update columns 0 and 1. */
266 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
267 src_c0 -= src_a0 * src_b;
268 src_c1 -= src_a1 * src_b;
269 src_c2 -= src_a2 * src_b;
270 src_c3 -= src_a3 * src_b;
272 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
273 src_c4 -= src_a0 * src_b;
274 src_c5 -= src_a1 * src_b;
275 src_c6 -= src_a2 * src_b;
276 src_c7 -= src_a3 * src_b;
/* Peeled final k-step using the already-loaded A vectors. */
285 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
286 src_c0 -= src_a0 * src_b;
287 src_c1 -= src_a1 * src_b;
288 src_c2 -= src_a2 * src_b;
289 src_c3 -= src_a3 * src_b;
291 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
292 src_c4 -= src_a0 * src_b;
293 src_c5 -= src_a1 * src_b;
294 src_c6 -= src_a2 * src_b;
295 src_c7 -= src_a3 * src_b;
/* Splat the 2x2 triangular block of B: b0 (diag), b2, b3 (diag). */
301 src_b0 = __msa_cast_to_vector_double(*(b + 0));
302 src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
303 src_b2 = LD_DP(b + 2);
304 src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
305 src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
/* Backward substitution: eliminate column 1 from column 0.
   Diagonal scaling (b3, b0) is not visible here — confirm. */
312 src_c0 -= src_c4 * src_b2;
313 src_c1 -= src_c5 * src_b2;
314 src_c2 -= src_c6 * src_b2;
315 src_c3 -= src_c7 * src_b2;
/* Store results to C and back into the packed A panel. */
322 ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
323 ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
325 ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
326 ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
/*
 * dsolve_8x1_rt_msa - solve an 8x1 tile for the RT DTRSM kernel.
 * Single column, so the "triangle" is the lone scalar *b.
 * NOTE(review): loop braces, the B loads inside the loop, and the
 * final diagonal-scaling statements are elided in this view.
 */
329 static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
331 v2f64 src_c0, src_c1, src_c2, src_c3;
/* Load the 8-element C column. */
334 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
339 FLOAT *aa = a, *bb = b;
340 v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
/* Preload first A column; loop software-pipelines via src_a4..7. */
343 LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3);
/* GEMM update over bk-1 steps (final step peeled below). */
349 for (i = (bk - 1); i--;)
351 LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7);
/* Broadcast b[k] and subtract the rank-1 update. */
354 src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
355 src_c0 -= src_a0 * src_b0;
356 src_c1 -= src_a1 * src_b0;
357 src_c2 -= src_a2 * src_b0;
358 src_c3 -= src_a3 * src_b0;
/* Peeled final k-step. */
370 src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
371 src_c0 -= src_a0 * src_b0;
372 src_c1 -= src_a1 * src_b0;
373 src_c2 -= src_a2 * src_b0;
374 src_c3 -= src_a3 * src_b0;
/* Splat the diagonal entry *b (scaling statements not visible). */
380 src_b0 = __msa_cast_to_vector_double(*b);
381 src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
/* Store result to C and into the packed A panel. */
388 ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
389 ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
/*
 * dsolve_4x4_rt_msa - solve a 4x4 tile for the RT DTRSM kernel.
 * Structure mirrors dsolve_8x4_rt_msa with half-size columns (two
 * v2f64 vectors per column instead of four).
 * NOTE(review): loop control around the GEMM update and the
 * diagonal-scaling statements (b15/b10/b5/b0) are elided in this view.
 */
392 static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
394 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
395 v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
396 v2f64 src_b14, src_b15;
/* Load the 4x4 C tile, two v2f64 vectors per column. */
398 LD_DP2(c, 2, src_c0, src_c1);
399 LD_DP2(c + ldc, 2, src_c2, src_c3);
400 LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
401 LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
406 FLOAT *aa = a, *bb = b;
407 v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
/* One k-step of the GEMM update (loop header not visible here). */
411 LD_DP2(aa, 2, src_a0, src_a1);
412 LD_DP2(bb, 2, src_b0, src_b1);
/* Broadcast each of the four B entries and update its column. */
414 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
415 src_c0 -= src_a0 * src_b;
416 src_c1 -= src_a1 * src_b;
418 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
419 src_c2 -= src_a0 * src_b;
420 src_c3 -= src_a1 * src_b;
422 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
423 src_c4 -= src_a0 * src_b;
424 src_c5 -= src_a1 * src_b;
426 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
427 src_c6 -= src_a0 * src_b;
428 src_c7 -= src_a1 * src_b;
/* Splat the triangular 4x4 block of B (same layout as the 8x4 kernel:
   b0; b4,b5; b8..b10; b12..b15). */
438 src_b12 = LD_DP(b + 12);
439 src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
440 src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
441 src_b14 = LD_DP(b + 14);
442 src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
443 src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
445 src_b8 = LD_DP(b + 8);
446 src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
447 src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
448 src_b10 = __msa_cast_to_vector_double(*(b + 10));
449 src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
451 src_b0 = __msa_cast_to_vector_double(*(b + 0));
452 src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
453 src_b4 = LD_DP(b + 4);
454 src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
455 src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
/* Backward substitution, last column first (diagonal scaling by
   b15/b10/b5/b0 not visible in this view — confirm). */
460 src_c4 -= src_c6 * src_b14;
461 src_c5 -= src_c7 * src_b14;
466 src_c2 -= src_c6 * src_b13;
467 src_c3 -= src_c7 * src_b13;
469 src_c2 -= src_c4 * src_b9;
470 src_c3 -= src_c5 * src_b9;
475 src_c0 -= src_c6 * src_b12;
476 src_c1 -= src_c7 * src_b12;
478 src_c0 -= src_c4 * src_b8;
479 src_c1 -= src_c5 * src_b8;
481 src_c0 -= src_c2 * src_b4;
482 src_c1 -= src_c3 * src_b4;
/* Store solved tile to C (column 3 first) and to the packed A panel. */
487 ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
488 ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
489 ST_DP2(src_c2, src_c3, c + ldc, 2);
490 ST_DP2(src_c0, src_c1, c, 2);
492 ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
493 ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
/*
 * dsolve_4x2_rt_msa - solve a 4x2 tile for the RT DTRSM kernel.
 * Two columns of two v2f64 vectors; the 2x2 triangle lives in b[0],
 * b[2..3].
 * NOTE(review): loop control, the B load inside the update step, and
 * the diagonal-scaling statements are elided in this view.
 */
496 static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
498 v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
/* Load the two C columns. */
500 LD_DP2(c, 2, src_c0, src_c1);
501 LD_DP2(c + ldc, 2, src_c2, src_c3);
506 FLOAT *aa = a, *bb = b;
507 v2f64 src_a0, src_a1, src_b, src_b0;
/* One k-step of the GEMM update (loop header not visible here). */
511 LD_DP2(aa, 2, src_a0, src_a1);
/* Broadcast b[k][0] / b[k][1] and update the two columns. */
514 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
515 src_c0 -= src_a0 * src_b;
516 src_c1 -= src_a1 * src_b;
518 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
519 src_c2 -= src_a0 * src_b;
520 src_c3 -= src_a1 * src_b;
/* Splat the 2x2 triangular block of B: b0 (diag), b2, b3 (diag). */
530 src_b0 = __msa_cast_to_vector_double(*(b + 0));
531 src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
532 src_b2 = LD_DP(b + 2);
533 src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
534 src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
/* Backward substitution: eliminate column 1 from column 0
   (diagonal scaling not visible — confirm). */
539 src_c0 -= src_c2 * src_b2;
540 src_c1 -= src_c3 * src_b2;
/* Store results to C and to the packed A panel. */
545 ST_DP2(src_c0, src_c1, c, 2);
546 ST_DP2(src_c2, src_c3, c + ldc, 2);
548 ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
/*
 * dsolve_4x1_rt_msa - solve a 4x1 tile for the RT DTRSM kernel,
 * scalar (non-vector) variant.
 * NOTE(review): the body is almost entirely elided in this view —
 * only the declarations below are visible; presumably it mirrors the
 * other scalar kernels (GEMM update then divide/scale by b0).
 */
551 static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
553 FLOAT b0, c0, c1, c2, c3;
563 FLOAT *aa = a, *bb = b;
/*
 * dsolve_2x4_rt_msa - solve a 2x4 tile for the RT DTRSM kernel,
 * scalar variant: two rows (c0, c1) across four columns (nxt1..nxt3).
 * NOTE(review): loop control, the loads of c0/c1 and of b0..b15, the
 * pointer increments, and the diagonal-scaling statements are elided
 * in this view.
 */
597 static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
599 FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
600 FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
/* Load the 2x4 C tile (columns 1..3; column 0 load not visible). */
604 c0_nxt1 = *(c + 0 + 1 * ldc);
605 c1_nxt1 = *(c + 1 + 1 * ldc);
606 c0_nxt2 = *(c + 0 + 2 * ldc);
607 c1_nxt2 = *(c + 1 + 2 * ldc);
608 c0_nxt3 = *(c + 0 + 3 * ldc);
609 c1_nxt3 = *(c + 1 + 3 * ldc);
614 FLOAT *aa = a, *bb = b;
/* GEMM update body: subtract a[0..1]*b[col] from each column. */
620 c0_nxt1 -= aa[0] * bb[1];
621 c1_nxt1 -= aa[1] * bb[1];
622 c0_nxt2 -= aa[0] * bb[2];
623 c1_nxt2 -= aa[1] * bb[2];
624 c0_nxt3 -= aa[0] * bb[3];
625 c1_nxt3 -= aa[1] * bb[3];
/* Backward substitution (visible parts: eliminate column 3 from
   column 2, then columns 3 and 2 from column 1; remaining steps and
   diagonal scaling elided). */
649 c0_nxt2 -= c0_nxt3 * b14;
650 c1_nxt2 -= c1_nxt3 * b14;
654 c0_nxt1 -= c0_nxt3 * b13;
655 c1_nxt1 -= c1_nxt3 * b13;
656 c0_nxt1 -= c0_nxt2 * b9;
657 c1_nxt1 -= c1_nxt2 * b9;
/* Store solved values back to C (column 0 store not visible). */
681 *(c + 0 + 1 * ldc) = c0_nxt1;
682 *(c + 1 + 1 * ldc) = c1_nxt1;
683 *(c + 0 + 2 * ldc) = c0_nxt2;
684 *(c + 1 + 2 * ldc) = c1_nxt2;
685 *(c + 0 + 3 * ldc) = c0_nxt3;
686 *(c + 1 + 3 * ldc) = c1_nxt3;
/*
 * dsolve_2x2_rt_msa - solve a 2x2 tile for the RT DTRSM kernel,
 * scalar variant: rows c0/c1, columns (c0,c1) and (c0_nxt,c1_nxt).
 * NOTE(review): loop control, loads of c0/c1 and b0/b2/b3, the
 * substitution and scaling statements, and the column-0 stores are
 * elided in this view.
 */
689 static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
691 FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
/* Load column 1 of the C tile (column-0 load not visible). */
695 c0_nxt = *(c + 0 + ldc);
696 c1_nxt = *(c + 1 + ldc);
701 FLOAT *aa = a, *bb = b;
/* GEMM update body for column 1. */
708 c0_nxt -= aa[0] * bb[1];
709 c1_nxt -= aa[1] * bb[1];
/* Store solved column 1 back to C. */
739 *(c + 0 + ldc) = c0_nxt;
740 *(c + 1 + ldc) = c1_nxt;
/*
 * dsolve_2x1_rt_msa - solve a 2x1 tile for the RT DTRSM kernel,
 * scalar variant.
 * NOTE(review): the body is almost entirely elided in this view;
 * only the working-pointer declarations are visible.
 */
743 static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
753 FLOAT *aa = a, *bb = b;
/*
 * dsolve_1x4_rt_msa - solve a 1x4 tile for the RT DTRSM kernel,
 * scalar variant (one row, four columns; b holds the 4x4 triangle).
 * NOTE(review): the body is almost entirely elided in this view;
 * only the declarations below are visible.
 */
777 static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
779 FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3;
789 FLOAT *aa = a, *bb = b;
/*
 * dsolve_1x2_rt_msa - solve a 1x2 tile for the RT DTRSM kernel,
 * scalar variant (one row, two columns; 2x2 triangle in b0/b2/b3).
 * NOTE(review): the body is almost entirely elided in this view;
 * only the declarations below are visible.
 */
842 static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
844 FLOAT b0, b2, b3, c0, c1;
852 FLOAT *aa = a, *bb = b;
/*
 * dsolve_1x1_rt_msa - solve a single element for the RT DTRSM kernel:
 * a loop of bk scalar updates (body elided in this view), presumably
 * followed by scaling with the diagonal entry — confirm against the
 * full file.
 */
883 static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
889 for (i = 0; i < bk; i++)
899 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
900 FLOAT *c, BLASLONG ldc, BLASLONG offset)
919 for (i = (m >> 3); i--;)
921 dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk));
931 dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk));
939 dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk));
947 dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk));
966 for (i = (m >> 3); i--;)
968 dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk));
978 dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk));
986 dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk));
994 dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
1005 for (j = (n >> 2); j--;)
1013 for (i = (m >> 3); i--;)
1015 dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk));
1025 dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk));
1033 dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk));
1041 dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk));