1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* Solves one 8x8 tile for the right-side STRSM (RT) kernel: first
 * subtracts bk rank-1 A*B updates from the C tile (GEMM phase), then
 * performs backward substitution against the packed triangular factor
 * in b, writing the solved tile back into the packed buffer a and into
 * the output matrix c (line stride ldc).
 * NOTE(review): this listing is an elided excerpt -- braces, guard
 * conditions, and the diagonal-scaling statements that normally
 * accompany each substitution step are not visible here; verify against
 * the complete source before making changes. */
33 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
34 v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
35 v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
36 v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
37 v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
38 v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
39 v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
/* One pointer per C line; consecutive lines are ldc floats apart. */
40 FLOAT *c_nxt1line = c + ldc;
41 FLOAT *c_nxt2line = c + 2 * ldc;
42 FLOAT *c_nxt3line = c + 3 * ldc;
43 FLOAT *c_nxt4line = c + 4 * ldc;
44 FLOAT *c_nxt5line = c + 5 * ldc;
45 FLOAT *c_nxt6line = c + 6 * ldc;
46 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load the whole 8x8 C tile: two 4-float vectors per line. */
48 LD_SP2(c, 4, src_c0, src_c1);
49 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
50 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
51 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
52 LD_SP2(c_nxt4line, 4, src_c8, src_c9);
53 LD_SP2(c_nxt5line, 4, src_c10, src_c11);
54 LD_SP2(c_nxt6line, 4, src_c12, src_c13);
55 LD_SP2(c_nxt7line, 4, src_c14, src_c15);
/* GEMM-phase locals: aa/bb walk the packed A and B panels. */
59 BLASLONG k, pref_offset;
60 FLOAT *aa = a, *bb = b, *pa0_pref;
61 v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1;
/* Align the A-prefetch cursor to the next L1 cache-line boundary. */
63 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
67 pref_offset = L1_DATA_LINESIZE - pref_offset;
68 pref_offset = pref_offset / sizeof(FLOAT);
71 pa0_pref = a + pref_offset;
/* Main update loop, two k steps per pass; subtracts A*B from the tile. */
73 for (k = 0; k < (bk >> 1); k++)
75 PREF_OFFSET(pa0_pref, 64);
76 PREF_OFFSET(pa0_pref, 96);
/* First k step: 8 A values against 8 splatted B values. */
78 LD_SP2_INC(aa, 4, src_a0, src_a1);
79 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
81 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
82 src_c0 -= src_a0 * src_b0;
83 src_c1 -= src_a1 * src_b0;
84 src_c2 -= src_a0 * src_b1;
85 src_c3 -= src_a1 * src_b1;
86 src_c4 -= src_a0 * src_b2;
87 src_c5 -= src_a1 * src_b2;
88 src_c6 -= src_a0 * src_b3;
89 src_c7 -= src_a1 * src_b3;
91 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
92 src_c8 -= src_a0 * src_b0;
93 src_c9 -= src_a1 * src_b0;
94 src_c10 -= src_a0 * src_b1;
95 src_c11 -= src_a1 * src_b1;
96 src_c12 -= src_a0 * src_b2;
97 src_c13 -= src_a1 * src_b2;
98 src_c14 -= src_a0 * src_b3;
99 src_c15 -= src_a1 * src_b3;
/* Second k step of the unrolled pair. */
101 LD_SP2_INC(aa, 4, src_a0, src_a1);
102 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
104 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
105 src_c0 -= src_a0 * src_b0;
106 src_c1 -= src_a1 * src_b0;
107 src_c2 -= src_a0 * src_b1;
108 src_c3 -= src_a1 * src_b1;
109 src_c4 -= src_a0 * src_b2;
110 src_c5 -= src_a1 * src_b2;
111 src_c6 -= src_a0 * src_b3;
112 src_c7 -= src_a1 * src_b3;
114 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
115 src_c8 -= src_a0 * src_b0;
116 src_c9 -= src_a1 * src_b0;
117 src_c10 -= src_a0 * src_b1;
118 src_c11 -= src_a1 * src_b1;
119 src_c12 -= src_a0 * src_b2;
120 src_c13 -= src_a1 * src_b2;
121 src_c14 -= src_a0 * src_b3;
122 src_c15 -= src_a1 * src_b3;
/* Tail k step -- presumably guarded by (bk & 1) in the elided text;
 * the guard is not visible in this excerpt, confirm in the full file. */
129 LD_SP2_INC(aa, 4, src_a0, src_a1);
130 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
132 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
133 src_c0 -= src_a0 * src_b0;
134 src_c1 -= src_a1 * src_b0;
135 src_c2 -= src_a0 * src_b1;
136 src_c3 -= src_a1 * src_b1;
137 src_c4 -= src_a0 * src_b2;
138 src_c5 -= src_a1 * src_b2;
139 src_c6 -= src_a0 * src_b3;
140 src_c7 -= src_a1 * src_b3;
142 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
143 src_c8 -= src_a0 * src_b0;
144 src_c9 -= src_a1 * src_b0;
145 src_c10 -= src_a0 * src_b1;
146 src_c11 -= src_a1 * src_b1;
147 src_c12 -= src_a0 * src_b2;
148 src_c13 -= src_a1 * src_b2;
149 src_c14 -= src_a0 * src_b3;
150 src_c15 -= src_a1 * src_b3;
/* Solve phase (backward substitution, last line first): splat the
 * last two rows of the triangular factor, b[56..63]. */
156 src_b = LD_SP(b + 60);
157 SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
158 src_b = LD_SP(b + 56);
159 SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
/* Eliminate line 7 (c14/c15) from all earlier lines; the preceding
 * scaling of c14/c15 by the inverted diagonal is elided here. */
163 src_c13 -= src_c15 * src_b62;
164 src_c12 -= src_c14 * src_b62;
165 src_c11 -= src_c15 * src_b61;
166 src_c10 -= src_c14 * src_b61;
167 src_c9 -= src_c15 * src_b60;
168 src_c8 -= src_c14 * src_b60;
169 src_c7 -= src_c15 * src_b59;
170 src_c6 -= src_c14 * src_b59;
171 src_c5 -= src_c15 * src_b58;
172 src_c4 -= src_c14 * src_b58;
173 src_c3 -= src_c15 * src_b57;
174 src_c2 -= src_c14 * src_b57;
175 src_c1 -= src_c15 * src_b56;
176 src_c0 -= src_c14 * src_b56;
/* Row 6 of the factor: b48..b54 (b55 above the diagonal is unused). */
178 src_b = LD_SP(b + 48);
179 SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
180 src_b52 = LD_SP(b + 52);
181 src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
182 src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
183 src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
/* Eliminate line 6 (c12/c13) from lines 0..5. */
187 src_c10 -= src_c12 * src_b53;
188 src_c11 -= src_c13 * src_b53;
189 src_c8 -= src_c12 * src_b52;
190 src_c9 -= src_c13 * src_b52;
191 src_c6 -= src_c12 * src_b51;
192 src_c7 -= src_c13 * src_b51;
193 src_c4 -= src_c12 * src_b50;
194 src_c5 -= src_c13 * src_b50;
195 src_c2 -= src_c12 * src_b49;
196 src_c3 -= src_c13 * src_b49;
197 src_c0 -= src_c12 * src_b48;
198 src_c1 -= src_c13 * src_b48;
/* Store solved lines 6/7 back into the packed A buffer and into C.
 * aa has advanced past the panel, hence the negative offset. */
200 ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4);
201 ST_SP2(src_c12, src_c13, c_nxt6line, 4);
202 ST_SP2(src_c14, src_c15, c_nxt7line, 4);
/* Row 5 of the factor: b40..b45. */
204 src_b = LD_SP(b + 40);
205 SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
206 src_b44 = LD_SP(b + 44);
207 src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
208 src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
/* Eliminate line 5 (c10/c11) from lines 0..4. */
212 src_c8 -= src_c10 * src_b44;
213 src_c9 -= src_c11 * src_b44;
214 src_c6 -= src_c10 * src_b43;
215 src_c7 -= src_c11 * src_b43;
216 src_c4 -= src_c10 * src_b42;
217 src_c5 -= src_c11 * src_b42;
218 src_c2 -= src_c10 * src_b41;
219 src_c3 -= src_c11 * src_b41;
220 src_c0 -= src_c10 * src_b40;
221 src_c1 -= src_c11 * src_b40;
/* Row 4 of the factor: b32..b36. */
223 src_b = LD_SP(b + 32);
224 SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
225 src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
/* Eliminate line 4 (c8/c9) from lines 0..3. */
229 src_c6 -= src_c8 * src_b35;
230 src_c7 -= src_c9 * src_b35;
231 src_c4 -= src_c8 * src_b34;
232 src_c5 -= src_c9 * src_b34;
233 src_c2 -= src_c8 * src_b33;
234 src_c3 -= src_c9 * src_b33;
235 src_c0 -= src_c8 * src_b32;
236 src_c1 -= src_c9 * src_b32;
/* Store solved lines 4/5. */
238 ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4);
239 ST_SP2(src_c8, src_c9, c_nxt4line, 4);
240 ST_SP2(src_c10, src_c11, c_nxt5line, 4);
/* Row 3 of the factor: b24..b27. */
242 src_b = LD_SP(b + 24);
243 SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
/* Eliminate line 3 (c6/c7) from lines 0..2. */
247 src_c4 -= src_c6 * src_b26;
248 src_c5 -= src_c7 * src_b26;
249 src_c2 -= src_c6 * src_b25;
250 src_c3 -= src_c7 * src_b25;
251 src_c0 -= src_c6 * src_b24;
252 src_c1 -= src_c7 * src_b24;
/* Row 2 of the factor: b16..b18. */
254 src_b16 = LD_SP(b + 16);
255 src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
256 src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
257 src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
/* Eliminate line 2 (c4/c5) from lines 0..1. */
261 src_c2 -= src_c4 * src_b17;
262 src_c3 -= src_c5 * src_b17;
263 src_c0 -= src_c4 * src_b16;
264 src_c1 -= src_c5 * src_b16;
/* Store solved lines 2/3. */
266 ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4);
267 ST_SP2(src_c4, src_c5, c_nxt2line, 4);
268 ST_SP2(src_c6, src_c7, c_nxt3line, 4);
/* Rows 1 and 0 of the factor: scalar broadcasts b9, b8, b0. */
270 src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
271 src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
272 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Eliminate line 1 (c2/c3) from line 0. */
276 src_c0 -= src_c2 * src_b8;
277 src_c1 -= src_c3 * src_b8;
/* Store solved lines 0/1. */
282 ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4);
284 ST_SP2(src_c0, src_c1, c, 4);
285 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
288 static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 8x4 variant of the RT solver: bk rank-1 GEMM updates on a 4-line C
 * tile, then backward substitution against the 4x4 triangular factor
 * in b (elements b0..b15, lower part used).
 * NOTE(review): elided excerpt -- braces, pointer-advance statements
 * and the diagonal scalings between substitution steps are not visible;
 * confirm against the complete source. */
291 FLOAT *aa = a, *bb = b;
292 v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
293 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
294 v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12;
295 v4f32 src_b13, src_b14, src_b15;
296 FLOAT *c_nxt1line = c + ldc;
297 FLOAT *c_nxt2line = c + 2 * ldc;
298 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 8x4 C tile: two vectors per line. */
300 LD_SP2(c, 4, src_c0, src_c1);
301 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
302 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
303 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
/* GEMM phase, two k steps per pass. */
305 for (k = 0; k < (bk >> 1); k++)
307 LD_SP2(aa, 4, src_a0, src_a1);
309 src_b = LD_SP(bb + 0);
310 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
311 src_c0 -= src_a0 * src_b0;
312 src_c1 -= src_a1 * src_b0;
313 src_c2 -= src_a0 * src_b1;
314 src_c3 -= src_a1 * src_b1;
315 src_c4 -= src_a0 * src_b2;
316 src_c5 -= src_a1 * src_b2;
317 src_c6 -= src_a0 * src_b3;
318 src_c7 -= src_a1 * src_b3;
/* Second k step of the unrolled pair (aa/bb advance is elided). */
323 LD_SP2(aa, 4, src_a0, src_a1);
325 src_b = LD_SP(bb + 0);
326 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
327 src_c0 -= src_a0 * src_b0;
328 src_c1 -= src_a1 * src_b0;
329 src_c2 -= src_a0 * src_b1;
330 src_c3 -= src_a1 * src_b1;
331 src_c4 -= src_a0 * src_b2;
332 src_c5 -= src_a1 * src_b2;
333 src_c6 -= src_a0 * src_b3;
334 src_c7 -= src_a1 * src_b3;
/* Tail k step when bk is odd. */
340 if ((bk & 1) && (bk > 0))
342 LD_SP2(aa, 4, src_a0, src_a1);
344 src_b = LD_SP(bb + 0);
345 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
346 src_c0 -= src_a0 * src_b0;
347 src_c1 -= src_a1 * src_b0;
348 src_c2 -= src_a0 * src_b1;
349 src_c3 -= src_a1 * src_b1;
350 src_c4 -= src_a0 * src_b2;
351 src_c5 -= src_a1 * src_b2;
352 src_c6 -= src_a0 * src_b3;
353 src_c7 -= src_a1 * src_b3;
/* Solve phase: splat rows of the 4x4 triangular factor. */
359 src_b = LD_SP(b + 12);
360 SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
361 src_b8 = LD_SP(b + 8);
362 src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
363 src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
364 src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
365 src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
366 src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
367 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Backward substitution: eliminate line 3, then 2, then 1. */
371 src_c5 -= src_c7 * src_b14;
372 src_c4 -= src_c6 * src_b14;
373 src_c3 -= src_c7 * src_b13;
374 src_c2 -= src_c6 * src_b13;
375 src_c1 -= src_c7 * src_b12;
376 src_c0 -= src_c6 * src_b12;
380 src_c3 -= src_c5 * src_b9;
381 src_c2 -= src_c4 * src_b9;
382 src_c1 -= src_c5 * src_b8;
383 src_c0 -= src_c4 * src_b8;
387 src_c1 -= src_c3 * src_b4;
388 src_c0 -= src_c2 * src_b4;
/* Write the solved tile back into packed A and into C. */
393 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
394 ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
396 ST_SP2(src_c0, src_c1, c, 4);
397 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
398 ST_SP2(src_c4, src_c5, c_nxt2line, 4);
399 ST_SP2(src_c6, src_c7, c_nxt3line, 4);
402 static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 8x2 variant of the RT solver: two C lines, 2x2 triangular factor in
 * b (b0, b2, b3).  NOTE(review): elided excerpt -- braces, pointer
 * advances and diagonal scalings are not visible; confirm in the full
 * source. */
405 FLOAT *aa = a, *bb = b;
406 v4f32 src_a0, src_a1, src_b1;
407 v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
408 FLOAT *c_nxt1line = c + ldc;
/* Load the 8x2 C tile. */
410 LD_SP2(c, 4, src_c0, src_c1);
411 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
/* GEMM phase, two k steps per pass. */
413 for (k = 0; k < (bk >> 1); k++)
415 LD_SP2(aa, 4, src_a0, src_a1);
417 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
418 src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
420 src_c0 -= src_a0 * src_b0;
421 src_c1 -= src_a1 * src_b0;
422 src_c2 -= src_a0 * src_b1;
423 src_c3 -= src_a1 * src_b1;
/* Second k step of the unrolled pair. */
428 LD_SP2(aa, 4, src_a0, src_a1);
430 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
431 src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
433 src_c0 -= src_a0 * src_b0;
434 src_c1 -= src_a1 * src_b0;
435 src_c2 -= src_a0 * src_b1;
436 src_c3 -= src_a1 * src_b1;
/* Tail k step when bk is odd. */
442 if ((bk & 1) && (bk > 0))
444 LD_SP2(aa, 4, src_a0, src_a1);
446 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
447 src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
449 src_c0 -= src_a0 * src_b0;
450 src_c1 -= src_a1 * src_b0;
451 src_c2 -= src_a0 * src_b1;
452 src_c3 -= src_a1 * src_b1;
/* Solve phase: broadcast the 2x2 factor entries. */
458 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
459 src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
460 src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
/* Backward substitution: eliminate line 1 from line 0. */
464 src_c0 -= src_c2 * src_b2;
465 src_c1 -= src_c3 * src_b2;
/* Write results back into packed A and into C. */
469 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
470 ST_SP2(src_c0, src_c1, c, 4);
471 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
474 static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/* 8x1 variant: a single C line and a 1x1 factor b0.  The GEMM loop is
 * unrolled four ways with a (bk & 3) remainder chain.
 * NOTE(review): elided excerpt -- braces, pointer advances and the
 * final scaling by b0 are not visible; confirm in the full source. */
477 FLOAT *aa = a, *bb = b;
478 v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
480 LD_SP2(c, 4, src_c0, src_c1);
/* GEMM phase, four k steps per pass. */
482 for (k = 0; k < (bk >> 2); k++)
484 LD_SP2(aa, 4, src_a0, src_a1);
486 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
488 src_c0 -= src_a0 * src_b0;
489 src_c1 -= src_a1 * src_b0;
/* Step 2 of 4. */
494 LD_SP2(aa, 4, src_a0, src_a1);
496 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
498 src_c0 -= src_a0 * src_b0;
499 src_c1 -= src_a1 * src_b0;
/* Step 3 of 4. */
504 LD_SP2(aa, 4, src_a0, src_a1);
506 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
508 src_c0 -= src_a0 * src_b0;
509 src_c1 -= src_a1 * src_b0;
/* Step 4 of 4. */
514 LD_SP2(aa, 4, src_a0, src_a1);
516 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
518 src_c0 -= src_a0 * src_b0;
519 src_c1 -= src_a1 * src_b0;
/* Remainder: up to three further k steps (inner guards elided). */
525 if ((bk & 3) && (bk > 0))
529 LD_SP2(aa, 4, src_a0, src_a1);
531 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
533 src_c0 -= src_a0 * src_b0;
534 src_c1 -= src_a1 * src_b0;
539 LD_SP2(aa, 4, src_a0, src_a1);
541 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
543 src_c0 -= src_a0 * src_b0;
544 src_c1 -= src_a1 * src_b0;
552 LD_SP2(aa, 4, src_a0, src_a1);
554 src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
556 src_c0 -= src_a0 * src_b0;
557 src_c1 -= src_a1 * src_b0;
/* Solve phase: broadcast b0 (scaling statement elided here). */
564 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Write the solved line back into packed A and into C. */
569 ST_SP2(src_c0, src_c1, a, 4);
570 ST_SP2(src_c0, src_c1, c, 4);
573 static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 4x8 variant: one 4-float vector per C line, eight lines, against the
 * same 8x8 triangular factor layout as ssolve_8x8_rt_msa.
 * NOTE(review): elided excerpt -- braces, the per-iteration A load, and
 * the diagonal scalings between substitution steps are not visible;
 * confirm against the complete source. */
576 FLOAT *aa = a, *bb = b;
577 v4f32 src_a0, src_b1, src_b2, src_b3;
578 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
579 v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
580 v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
581 v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
582 v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
583 v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
584 FLOAT *c_nxt1line = c + ldc;
585 FLOAT *c_nxt2line = c + 2 * ldc;
586 FLOAT *c_nxt3line = c + 3 * ldc;
587 FLOAT *c_nxt4line = c + 4 * ldc;
588 FLOAT *c_nxt5line = c + 5 * ldc;
589 FLOAT *c_nxt6line = c + 6 * ldc;
590 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load the 4x8 C tile, one vector per line (line 0 load elided). */
593 src_c1 = LD_SP(c_nxt1line);
594 src_c2 = LD_SP(c_nxt2line);
595 src_c3 = LD_SP(c_nxt3line);
596 src_c4 = LD_SP(c_nxt4line);
597 src_c5 = LD_SP(c_nxt5line);
598 src_c6 = LD_SP(c_nxt6line);
599 src_c7 = LD_SP(c_nxt7line);
/* GEMM phase: one k step per pass (A load into src_a0 is elided). */
601 for (k = 0; k < bk; k++)
605 src_b = LD_SP(bb + 0);
606 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
607 src_c0 -= src_a0 * src_b0;
608 src_c1 -= src_a0 * src_b1;
609 src_c2 -= src_a0 * src_b2;
610 src_c3 -= src_a0 * src_b3;
612 src_b = LD_SP(bb + 4);
613 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
614 src_c4 -= src_a0 * src_b0;
615 src_c5 -= src_a0 * src_b1;
616 src_c6 -= src_a0 * src_b2;
617 src_c7 -= src_a0 * src_b3;
/* Solve phase: splat all needed rows of the 8x8 triangular factor. */
626 src_b = LD_SP(b + 60);
627 SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
628 src_b = LD_SP(b + 56);
629 SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
631 src_b = LD_SP(b + 48);
632 SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
633 src_b52 = LD_SP(b + 52);
634 src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
635 src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
636 src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
638 src_b = LD_SP(b + 40);
639 SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
640 src_b44 = LD_SP(b + 44);
641 src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
642 src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
644 src_b = LD_SP(b + 32);
645 SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
646 src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
648 src_b = LD_SP(b + 24);
649 SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
651 src_b16 = LD_SP(b + 16);
652 src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
653 src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
654 src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
656 src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
657 src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
658 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Backward substitution, one line per step; per-line diagonal
 * scalings between steps are elided in this excerpt. */
661 src_c6 -= src_c7 * src_b62;
662 src_c5 -= src_c7 * src_b61;
663 src_c4 -= src_c7 * src_b60;
664 src_c3 -= src_c7 * src_b59;
665 src_c2 -= src_c7 * src_b58;
666 src_c1 -= src_c7 * src_b57;
667 src_c0 -= src_c7 * src_b56;
670 src_c5 -= src_c6 * src_b53;
671 src_c4 -= src_c6 * src_b52;
672 src_c3 -= src_c6 * src_b51;
673 src_c2 -= src_c6 * src_b50;
674 src_c1 -= src_c6 * src_b49;
675 src_c0 -= src_c6 * src_b48;
678 src_c4 -= src_c5 * src_b44;
679 src_c3 -= src_c5 * src_b43;
680 src_c2 -= src_c5 * src_b42;
681 src_c1 -= src_c5 * src_b41;
682 src_c0 -= src_c5 * src_b40;
685 src_c3 -= src_c4 * src_b35;
686 src_c2 -= src_c4 * src_b34;
687 src_c1 -= src_c4 * src_b33;
688 src_c0 -= src_c4 * src_b32;
691 src_c2 -= src_c3 * src_b26;
692 src_c1 -= src_c3 * src_b25;
693 src_c0 -= src_c3 * src_b24;
696 src_c1 -= src_c2 * src_b17;
697 src_c0 -= src_c2 * src_b16;
700 src_c0 -= src_c1 * src_b8;
/* Write the solved tile back into packed A and into C. */
704 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
705 ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
708 ST_SP(src_c1, c_nxt1line);
709 ST_SP(src_c2, c_nxt2line);
710 ST_SP(src_c3, c_nxt3line);
711 ST_SP(src_c4, c_nxt4line);
712 ST_SP(src_c5, c_nxt5line);
713 ST_SP(src_c6, c_nxt6line);
714 ST_SP(src_c7, c_nxt7line);
717 static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 4x4 variant: one vector per C line against the 4x4 triangular factor
 * b0..b15.  NOTE(review): elided excerpt -- braces, the src_a/src_b
 * loads inside the loop, and the diagonal scalings are not visible;
 * confirm against the complete source. */
720 FLOAT *aa = a, *bb = b;
721 v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
722 v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
723 v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3;
724 FLOAT *c_nxt1line = c + ldc;
725 FLOAT *c_nxt2line = c + 2 * ldc;
726 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 4x4 C tile (line 0 load elided in this excerpt). */
729 src_c1 = LD_SP(c_nxt1line);
730 src_c2 = LD_SP(c_nxt2line);
731 src_c3 = LD_SP(c_nxt3line);
/* GEMM phase, two k steps per pass (loads of src_a/src_b elided). */
733 for (k = 0; k < (bk >> 1); k++)
738 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
740 src_c0 -= src_a * src_b0;
741 src_c1 -= src_a * src_b1;
742 src_c2 -= src_a * src_b2;
743 src_c3 -= src_a * src_b3;
/* Second k step of the unrolled pair. */
751 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
753 src_c0 -= src_a * src_b0;
754 src_c1 -= src_a * src_b1;
755 src_c2 -= src_a * src_b2;
756 src_c3 -= src_a * src_b3;
/* Tail k step when bk is odd. */
762 if ((bk & 1) && (bk > 0))
767 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
769 src_c0 -= src_a * src_b0;
770 src_c1 -= src_a * src_b1;
771 src_c2 -= src_a * src_b2;
772 src_c3 -= src_a * src_b3;
/* Solve phase: splat the 4x4 factor rows. */
778 src_b = LD_SP(b + 12);
779 SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
780 src_b8 = LD_SP(b + 8);
781 src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
782 src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
783 src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
784 src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
785 src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
786 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Backward substitution: eliminate line 3, then 2, then 1. */
789 src_c2 -= src_c3 * src_b14;
790 src_c1 -= src_c3 * src_b13;
791 src_c0 -= src_c3 * src_b12;
794 src_c1 -= src_c2 * src_b9;
795 src_c0 -= src_c2 * src_b8;
798 src_c0 -= src_c1 * src_b4;
/* Write the solved tile back into packed A and into C. */
802 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
805 ST_SP(src_c1, c_nxt1line);
806 ST_SP(src_c2, c_nxt2line);
807 ST_SP(src_c3, c_nxt3line);
810 static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 4x2 variant: two C lines, 2x2 factor (b0, b2, b3).  GEMM loop is
 * unrolled four ways with a (bk & 3) remainder chain.
 * NOTE(review): elided excerpt -- braces, the src_a/src_b0 loads in
 * each step, and the diagonal scalings are not visible; confirm
 * against the complete source. */
813 FLOAT *aa = a, *bb = b;
814 v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3;
815 FLOAT *c_nxt1line = c + ldc;
/* Load the 4x2 C tile (line 0 load elided). */
818 src_c1 = LD_SP(c_nxt1line);
/* GEMM phase, four k steps per pass. */
820 for (k = 0; k < (bk >> 2); k++)
824 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
825 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
827 src_c0 -= src_a * src_b0;
828 src_c1 -= src_a * src_b1;
/* Step 2 of 4. */
835 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
836 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
838 src_c0 -= src_a * src_b0;
839 src_c1 -= src_a * src_b1;
/* Step 3 of 4. */
846 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
847 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
849 src_c0 -= src_a * src_b0;
850 src_c1 -= src_a * src_b1;
/* Step 4 of 4. */
857 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
858 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
860 src_c0 -= src_a * src_b0;
861 src_c1 -= src_a * src_b1;
/* Remainder: up to three further k steps (inner guards elided). */
867 if ((bk & 3) && (bk > 0))
873 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
874 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
876 src_c0 -= src_a * src_b0;
877 src_c1 -= src_a * src_b1;
884 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
885 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
887 src_c0 -= src_a * src_b0;
888 src_c1 -= src_a * src_b1;
898 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
899 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
901 src_c0 -= src_a * src_b0;
902 src_c1 -= src_a * src_b1;
/* Solve phase: broadcast the 2x2 factor entries. */
909 src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
910 src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
911 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Backward substitution: eliminate line 1 from line 0. */
914 src_c0 -= src_c1 * src_b2;
/* Write results back into packed A and into C. */
917 ST_SP2(src_c0, src_c1, a, 4);
920 ST_SP(src_c1, c_nxt1line);
923 static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/* 4x1 scalar variant: four C values, 1x1 factor b0.
 * NOTE(review): almost the entire body (loop contents, solve, stores)
 * is elided from this excerpt; do not modify without the full file. */
926 FLOAT *aa = a, *bb = b;
927 FLOAT b0, c0, c1, c2, c3;
/* GEMM phase over all bk steps (loop body elided). */
934 for (k = 0; k < bk; k++)
966 static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 2x8 scalar variant: two C values per line, eight lines, against the
 * 8x8 triangular factor held in scalars b0..b63.
 * NOTE(review): elided excerpt -- the b-element loads, the diagonal
 * scalings, pointer advances and braces are not visible; confirm
 * against the complete source. */
969 FLOAT *aa = a, *bb = b;
970 FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
971 FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
972 FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7;
973 FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
974 FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
/* Load the 2x8 C tile (loads for c0/c1 at line 0 are elided). */
978 c0_nxt1 = *(c + 0 + 1 * ldc);
979 c1_nxt1 = *(c + 1 + 1 * ldc);
980 c0_nxt2 = *(c + 0 + 2 * ldc);
981 c1_nxt2 = *(c + 1 + 2 * ldc);
982 c0_nxt3 = *(c + 0 + 3 * ldc);
983 c1_nxt3 = *(c + 1 + 3 * ldc);
984 c0_nxt4 = *(c + 0 + 4 * ldc);
985 c1_nxt4 = *(c + 1 + 4 * ldc);
986 c0_nxt5 = *(c + 0 + 5 * ldc);
987 c1_nxt5 = *(c + 1 + 5 * ldc);
988 c0_nxt6 = *(c + 0 + 6 * ldc);
989 c1_nxt6 = *(c + 1 + 6 * ldc);
990 c0_nxt7 = *(c + 0 + 7 * ldc);
991 c1_nxt7 = *(c + 1 + 7 * ldc);
/* GEMM phase: subtract aa[0..1] * bb[0..7] each k step. */
993 for (k = 0; k < bk; k++)
997 c0_nxt1 -= aa[0] * bb[1];
998 c1_nxt1 -= aa[1] * bb[1];
999 c0_nxt2 -= aa[0] * bb[2];
1000 c1_nxt2 -= aa[1] * bb[2];
1001 c0_nxt3 -= aa[0] * bb[3];
1002 c1_nxt3 -= aa[1] * bb[3];
1003 c0_nxt4 -= aa[0] * bb[4];
1004 c1_nxt4 -= aa[1] * bb[4];
1005 c0_nxt5 -= aa[0] * bb[5];
1006 c1_nxt5 -= aa[1] * bb[5];
1007 c0_nxt6 -= aa[0] * bb[6];
1008 c1_nxt6 -= aa[1] * bb[6];
1009 c0_nxt7 -= aa[0] * bb[7];
1010 c1_nxt7 -= aa[1] * bb[7];
/* Backward substitution, last line first.  The loads of b56..b63 etc.
 * and the per-line diagonal scalings are elided in this excerpt. */
1059 c0_nxt6 -= c0_nxt7 * b62;
1060 c1_nxt6 -= c1_nxt7 * b62;
/* Line 5. */
1065 c0_nxt5 -= c0_nxt7 * b61;
1066 c1_nxt5 -= c1_nxt7 * b61;
1068 c0_nxt5 -= c0_nxt6 * b53;
1069 c1_nxt5 -= c1_nxt6 * b53;
/* Line 4. */
1074 c0_nxt4 -= c0_nxt7 * b60;
1075 c1_nxt4 -= c1_nxt7 * b60;
1077 c0_nxt4 -= c0_nxt6 * b52;
1078 c1_nxt4 -= c1_nxt6 * b52;
1080 c0_nxt4 -= c0_nxt5 * b44;
1081 c1_nxt4 -= c1_nxt5 * b44;
/* Line 3. */
1086 c0_nxt3 -= c0_nxt7 * b59;
1087 c1_nxt3 -= c1_nxt7 * b59;
1089 c0_nxt3 -= c0_nxt6 * b51;
1090 c1_nxt3 -= c1_nxt6 * b51;
1092 c0_nxt3 -= c0_nxt5 * b43;
1093 c1_nxt3 -= c1_nxt5 * b43;
1095 c0_nxt3 -= c0_nxt4 * b35;
1096 c1_nxt3 -= c1_nxt4 * b35;
/* Line 2. */
1101 c0_nxt2 -= c0_nxt7 * b58;
1102 c1_nxt2 -= c1_nxt7 * b58;
1104 c0_nxt2 -= c0_nxt6 * b50;
1105 c1_nxt2 -= c1_nxt6 * b50;
1107 c0_nxt2 -= c0_nxt5 * b42;
1108 c1_nxt2 -= c1_nxt5 * b42;
1110 c0_nxt2 -= c0_nxt4 * b34;
1111 c1_nxt2 -= c1_nxt4 * b34;
1113 c0_nxt2 -= c0_nxt3 * b26;
1114 c1_nxt2 -= c1_nxt3 * b26;
/* Line 1. */
1119 c0_nxt1 -= c0_nxt7 * b57;
1120 c1_nxt1 -= c1_nxt7 * b57;
1122 c0_nxt1 -= c0_nxt6 * b49;
1123 c1_nxt1 -= c1_nxt6 * b49;
1125 c0_nxt1 -= c0_nxt5 * b41;
1126 c1_nxt1 -= c1_nxt5 * b41;
1128 c0_nxt1 -= c0_nxt4 * b33;
1129 c1_nxt1 -= c1_nxt4 * b33;
1131 c0_nxt1 -= c0_nxt3 * b25;
1132 c1_nxt1 -= c1_nxt3 * b25;
1134 c0_nxt1 -= c0_nxt2 * b17;
1135 c1_nxt1 -= c1_nxt2 * b17;
/* Line 0. */
1140 c0 -= c0_nxt7 * b56;
1141 c1 -= c1_nxt7 * b56;
1143 c0 -= c0_nxt6 * b48;
1144 c1 -= c1_nxt6 * b48;
1146 c0 -= c0_nxt5 * b40;
1147 c1 -= c1_nxt5 * b40;
1149 c0 -= c0_nxt4 * b32;
1150 c1 -= c1_nxt4 * b32;
1152 c0 -= c0_nxt3 * b24;
1153 c1 -= c1_nxt3 * b24;
1155 c0 -= c0_nxt2 * b16;
1156 c1 -= c1_nxt2 * b16;
/* Write results back into packed A (stores for indices 0..9 elided). */
1174 *(a + 10) = c0_nxt5;
1175 *(a + 11) = c1_nxt5;
1176 *(a + 12) = c0_nxt6;
1177 *(a + 13) = c1_nxt6;
1178 *(a + 14) = c0_nxt7;
1179 *(a + 15) = c1_nxt7;
/* And back into C, one 2-element pair per line. */
1183 *(c + 0 + 1 * ldc) = c0_nxt1;
1184 *(c + 1 + 1 * ldc) = c1_nxt1;
1185 *(c + 0 + 2 * ldc) = c0_nxt2;
1186 *(c + 1 + 2 * ldc) = c1_nxt2;
1187 *(c + 0 + 3 * ldc) = c0_nxt3;
1188 *(c + 1 + 3 * ldc) = c1_nxt3;
1189 *(c + 0 + 4 * ldc) = c0_nxt4;
1190 *(c + 1 + 4 * ldc) = c1_nxt4;
1191 *(c + 0 + 5 * ldc) = c0_nxt5;
1192 *(c + 1 + 5 * ldc) = c1_nxt5;
1193 *(c + 0 + 6 * ldc) = c0_nxt6;
1194 *(c + 1 + 6 * ldc) = c1_nxt6;
1195 *(c + 0 + 7 * ldc) = c0_nxt7;
1196 *(c + 1 + 7 * ldc) = c1_nxt7;
1199 static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 2x4 scalar variant: two C values per line, four lines, 4x4 factor.
 * Here the substitution multiplies by b0/b5/b10/b15, so those scalars
 * presumably hold the inverted diagonal -- confirm in the elided loads.
 * NOTE(review): braces, b-element loads and some stores are elided. */
1202 FLOAT *aa = a, *bb = b;
1203 FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
1204 FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
/* Load the 2x4 C tile (line 0 loads elided). */
1208 c0_nxt1 = *(c + 0 + 1 * ldc);
1209 c1_nxt1 = *(c + 1 + 1 * ldc);
1210 c0_nxt2 = *(c + 0 + 2 * ldc);
1211 c1_nxt2 = *(c + 1 + 2 * ldc);
1212 c0_nxt3 = *(c + 0 + 3 * ldc);
1213 c1_nxt3 = *(c + 1 + 3 * ldc);
/* GEMM phase. */
1215 for (k = 0; k < bk; k++)
1217 c0 -= aa[0] * bb[0];
1218 c1 -= aa[1] * bb[0];
1219 c0_nxt1 -= aa[0] * bb[1];
1220 c1_nxt1 -= aa[1] * bb[1];
1221 c0_nxt2 -= aa[0] * bb[2];
1222 c1_nxt2 -= aa[1] * bb[2];
1223 c0_nxt3 -= aa[0] * bb[3];
1224 c1_nxt3 -= aa[1] * bb[3];
/* Backward substitution with the (apparently pre-inverted) diagonal. */
1247 c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10;
1248 c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10;
1250 c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5;
1251 c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5;
1253 c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0;
1254 c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0;
/* Write results back into C (packed-A stores elided). */
1267 *(c + 0 + 1 * ldc) = c0_nxt1;
1268 *(c + 1 + 1 * ldc) = c1_nxt1;
1269 *(c + 0 + 2 * ldc) = c0_nxt2;
1270 *(c + 1 + 2 * ldc) = c1_nxt2;
1271 *(c + 0 + 3 * ldc) = c0_nxt3;
1272 *(c + 1 + 3 * ldc) = c1_nxt3;
1275 static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 2x2 scalar variant: two C values per line, two lines, 2x2 factor
 * (b0, b2, b3).  NOTE(review): elided excerpt -- the substitution
 * statements, b loads, and packed-A stores are not visible. */
1278 FLOAT *aa = a, *bb = b;
1279 FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
/* Load the 2x2 C tile (line 0 loads elided). */
1283 c0_nxt = *(c + 0 + ldc);
1284 c1_nxt = *(c + 1 + ldc);
/* GEMM phase. */
1286 for (k = 0; k < bk; k++)
1288 c0 -= aa[0] * bb[0];
1289 c1 -= aa[1] * bb[0];
1290 c0_nxt -= aa[0] * bb[1];
1291 c1_nxt -= aa[1] * bb[1];
/* Write the second line back to C (remaining stores elided). */
1320 *(c + 0 + ldc) = c0_nxt;
1321 *(c + 1 + ldc) = c1_nxt;
1324 static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/* 2x1 scalar variant: two C values, 1x1 factor.
 * NOTE(review): the solve step and all stores are elided here. */
1327 FLOAT *aa = a, *bb = b;
/* GEMM phase. */
1333 for (k = 0; k < bk; k++)
1335 c0 -= aa[0] * bb[0];
1336 c1 -= aa[1] * bb[0];
1357 static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 1x8 scalar variant: one C value per line, eight lines, against the
 * 8x8 triangular factor scalars b0..b63.
 * NOTE(review): the whole substitution section and packed-A stores
 * are elided from this excerpt; confirm in the full source. */
1360 FLOAT *aa = a, *bb = b;
1361 FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
1362 FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
1363 FLOAT b56, b57, b58, b59, b60, b61, b62, b63;
1364 FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
/* Load the 1x8 C tile (c0 load elided). */
1367 c1 = *(c + 1 * ldc);
1368 c2 = *(c + 2 * ldc);
1369 c3 = *(c + 3 * ldc);
1370 c4 = *(c + 4 * ldc);
1371 c5 = *(c + 5 * ldc);
1372 c6 = *(c + 6 * ldc);
1373 c7 = *(c + 7 * ldc);
/* GEMM phase. */
1375 for (k = 0; k < bk; k++)
1377 c0 -= aa[0] * bb[0];
1378 c1 -= aa[0] * bb[1];
1379 c2 -= aa[0] * bb[2];
1380 c3 -= aa[0] * bb[3];
1381 c4 -= aa[0] * bb[4];
1382 c5 -= aa[0] * bb[5];
1383 c6 -= aa[0] * bb[6];
1384 c7 -= aa[0] * bb[7];
/* Write results back to C (the c0 store and the solve are elided). */
1484 *(c + 1 * ldc) = c1;
1485 *(c + 2 * ldc) = c2;
1486 *(c + 3 * ldc) = c3;
1487 *(c + 4 * ldc) = c4;
1488 *(c + 5 * ldc) = c5;
1489 *(c + 6 * ldc) = c6;
1490 *(c + 7 * ldc) = c7;
1493 static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 1x4 scalar variant: one C value per line, four lines, 4x4 factor.
 * As in ssolve_2x4, the substitution multiplies by b0/b5/b10/b15,
 * which presumably hold the inverted diagonal -- confirm in the
 * elided b loads.  NOTE(review): braces, b loads and some stores are
 * elided from this excerpt. */
1496 FLOAT *aa = a, *bb = b;
1497 FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
1498 FLOAT c0, c1, c2, c3;
/* Load the 1x4 C tile (c0 load elided). */
1501 c1 = *(c + 1 * ldc);
1502 c2 = *(c + 2 * ldc);
1503 c3 = *(c + 3 * ldc);
/* GEMM phase. */
1505 for (k = 0; k < bk; k++)
1507 c0 -= aa[0] * bb[0];
1508 c1 -= aa[0] * bb[1];
1509 c2 -= aa[0] * bb[2];
1510 c3 -= aa[0] * bb[3];
/* Backward substitution, last line first. */
1531 c2 = (c2 - c3 * b14) * b10;
1532 c1 = ((c1 - c3 * b13) - c2 * b9) * b5;
1533 c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0;
/* Write results back to C (c0 store and packed-A stores elided). */
1541 *(c + 1 * ldc) = c1;
1542 *(c + 2 * ldc) = c2;
1543 *(c + 3 * ldc) = c3;
1546 static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* 1x2 scalar variant: one C value per line, two lines, 2x2 factor.
 * NOTE(review): the solve step and all stores are elided here. */
1549 FLOAT *aa = a, *bb = b;
1550 FLOAT b0, b2, b3, c0, c1;
/* GEMM phase. */
1555 for (k = 0; k < bk; k++)
1557 c0 -= aa[0] * bb[0];
1558 c1 -= aa[0] * bb[1];
1583 static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/* 1x1 scalar variant: a single C value against a 1x1 factor.
 * NOTE(review): the entire body apart from the loop header is elided
 * from this excerpt. */
1587 for (k = 0; k < bk; k++)
1596 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
1597 FLOAT *c, BLASLONG ldc, BLASLONG offset)
1615 for (i = (m >> 3); i--;)
1617 ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));
1627 ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));
1635 ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));
1643 ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));
1660 for (i = (m >> 3); i--;)
1662 ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
1672 ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));
1680 ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));
1688 ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));
1705 for (i = (m >> 3); i--;)
1707 ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
1717 ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));
1725 ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));
1733 ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));
1744 for (j = (n >> 3); j--;)
1751 for (i = (m >> 3); i--;)
1753 ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
1763 ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));
1771 ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));
1779 ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));