1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/* Solve an 8x8 tile for TRSM "RN" (right side, non-transposed B).
 * Phase 1: C -= A * B over the bk already-solved columns (GEMM update).
 * Phase 2: forward-substitute against the 8x8 triangular block of B,
 *          splatting single b elements across v4f32 vectors, writing the
 *          solved tile back to both the packed buffer `a` and matrix `c`.
 * NOTE(review): this extraction is missing lines (braces, diagonal-scaling
 * statements, pa0_pref declaration, etc.); visible code kept byte-identical. */
31 static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
33 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
34 v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
35 v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
36 v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
37 v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
38 v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
39 v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b;
/* Pointers to the 8 consecutive rows (columns in BLAS terms) of the C tile. */
40 FLOAT *c_nxt1line = c + ldc;
41 FLOAT *c_nxt2line = c + 2 * ldc;
42 FLOAT *c_nxt3line = c + 3 * ldc;
43 FLOAT *c_nxt4line = c + 4 * ldc;
44 FLOAT *c_nxt5line = c + 5 * ldc;
45 FLOAT *c_nxt6line = c + 6 * ldc;
46 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load the 8x8 C tile: two 4-float vectors per line. */
48 LD_SP2(c, 4, src_c0, src_c1);
49 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
50 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
51 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
52 LD_SP2(c_nxt4line, 4, src_c8, src_c9);
53 LD_SP2(c_nxt5line, 4, src_c10, src_c11);
54 LD_SP2(c_nxt6line, 4, src_c12, src_c13);
55 LD_SP2(c_nxt7line, 4, src_c14, src_c15);
59 BLASLONG k, pref_offset;
61 v4f32 src_a0, src_a1, src_bb0, src_bb1;
/* Align the software-prefetch pointer to the next L1 cache-line boundary. */
63 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
67 pref_offset = L1_DATA_LINESIZE - pref_offset;
68 pref_offset = pref_offset / sizeof(FLOAT);
71 pa0_pref = a + pref_offset;
/* GEMM update, unrolled by 2 over bk: C -= A(8x1) * B(1x8) per iteration. */
73 for (k = 0; k < (bk >> 1); k++)
75 PREF_OFFSET(pa0_pref, 64);
76 PREF_OFFSET(pa0_pref, 96);
78 LD_SP2_INC(a, 4, src_a0, src_a1);
79 LD_SP2_INC(b, 4, src_bb0, src_bb1);
81 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
82 src_c0 -= src_a0 * src_b0;
83 src_c1 -= src_a1 * src_b0;
84 src_c2 -= src_a0 * src_b1;
85 src_c3 -= src_a1 * src_b1;
86 src_c4 -= src_a0 * src_b2;
87 src_c5 -= src_a1 * src_b2;
88 src_c6 -= src_a0 * src_b3;
89 src_c7 -= src_a1 * src_b3;
91 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
92 src_c8 -= src_a0 * src_b0;
93 src_c9 -= src_a1 * src_b0;
94 src_c10 -= src_a0 * src_b1;
95 src_c11 -= src_a1 * src_b1;
96 src_c12 -= src_a0 * src_b2;
97 src_c13 -= src_a1 * src_b2;
98 src_c14 -= src_a0 * src_b3;
99 src_c15 -= src_a1 * src_b3;
/* Second unrolled iteration. */
101 LD_SP2_INC(a, 4, src_a0, src_a1);
102 LD_SP2_INC(b, 4, src_bb0, src_bb1);
104 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
105 src_c0 -= src_a0 * src_b0;
106 src_c1 -= src_a1 * src_b0;
107 src_c2 -= src_a0 * src_b1;
108 src_c3 -= src_a1 * src_b1;
109 src_c4 -= src_a0 * src_b2;
110 src_c5 -= src_a1 * src_b2;
111 src_c6 -= src_a0 * src_b3;
112 src_c7 -= src_a1 * src_b3;
114 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
115 src_c8 -= src_a0 * src_b0;
116 src_c9 -= src_a1 * src_b0;
117 src_c10 -= src_a0 * src_b1;
118 src_c11 -= src_a1 * src_b1;
119 src_c12 -= src_a0 * src_b2;
120 src_c13 -= src_a1 * src_b2;
121 src_c14 -= src_a0 * src_b3;
122 src_c15 -= src_a1 * src_b3;
/* Remainder iteration when bk is odd (condition line missing in this view). */
129 LD_SP2_INC(a, 4, src_a0, src_a1);
130 LD_SP2_INC(b, 4, src_bb0, src_bb1);
132 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
133 src_c0 -= src_a0 * src_b0;
134 src_c1 -= src_a1 * src_b0;
135 src_c2 -= src_a0 * src_b1;
136 src_c3 -= src_a1 * src_b1;
137 src_c4 -= src_a0 * src_b2;
138 src_c5 -= src_a1 * src_b2;
139 src_c6 -= src_a0 * src_b3;
140 src_c7 -= src_a1 * src_b3;
142 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
143 src_c8 -= src_a0 * src_b0;
144 src_c9 -= src_a1 * src_b0;
145 src_c10 -= src_a0 * src_b1;
146 src_c11 -= src_a1 * src_b1;
147 src_c12 -= src_a0 * src_b2;
148 src_c13 -= src_a1 * src_b2;
149 src_c14 -= src_a0 * src_b3;
150 src_c15 -= src_a1 * src_b3;
/* Phase 2: b now points at the 8x8 triangular factor. Broadcast rows 0 and 1
 * (offsets 0..7 and 9..15; offset 8 would be below the diagonal). */
154 src_b = LD_SP(b + 0);
155 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
156 src_b = LD_SP(b + 4);
157 SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
159 src_b = LD_SP(b + 9);
160 SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
161 src_b13 = LD_SP(b + 13);
162 src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
163 src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
164 src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
/* Column 0 solved (diagonal-scale lines missing here); eliminate it from
 * the remaining seven columns. */
168 src_c2 -= src_c0 * src_b1;
169 src_c3 -= src_c1 * src_b1;
170 src_c4 -= src_c0 * src_b2;
171 src_c5 -= src_c1 * src_b2;
172 src_c6 -= src_c0 * src_b3;
173 src_c7 -= src_c1 * src_b3;
174 src_c8 -= src_c0 * src_b4;
175 src_c9 -= src_c1 * src_b4;
176 src_c10 -= src_c0 * src_b5;
177 src_c11 -= src_c1 * src_b5;
178 src_c12 -= src_c0 * src_b6;
179 src_c13 -= src_c1 * src_b6;
180 src_c14 -= src_c0 * src_b7;
181 src_c15 -= src_c1 * src_b7;
/* Store solved column 0 to the packed buffer and back to C. */
183 ST_SP2(src_c0, src_c1, a, 4);
184 ST_SP2(src_c0, src_c1, c, 4);
/* Column 1 solved; eliminate from columns 2..7. */
188 src_c4 -= src_c2 * src_b10;
189 src_c5 -= src_c3 * src_b10;
190 src_c6 -= src_c2 * src_b11;
191 src_c7 -= src_c3 * src_b11;
192 src_c8 -= src_c2 * src_b12;
193 src_c9 -= src_c3 * src_b12;
194 src_c10 -= src_c2 * src_b13;
195 src_c11 -= src_c3 * src_b13;
196 src_c12 -= src_c2 * src_b14;
197 src_c13 -= src_c3 * src_b14;
198 src_c14 -= src_c2 * src_b15;
199 src_c15 -= src_c3 * src_b15;
201 ST_SP2(src_c2, src_c3, a + 8, 4);
202 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
/* Broadcast triangular rows 2 and 3 (offsets 18..23 and 27..31). */
204 src_b = LD_SP(b + 18);
205 SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
206 src_b22 = LD_SP(b + 22);
207 src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
208 src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
210 src_b = LD_SP(b + 27);
211 SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
212 src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
/* Column 2 solved; eliminate from columns 3..7. */
216 src_c6 -= src_c4 * src_b19;
217 src_c7 -= src_c5 * src_b19;
218 src_c8 -= src_c4 * src_b20;
219 src_c9 -= src_c5 * src_b20;
220 src_c10 -= src_c4 * src_b21;
221 src_c11 -= src_c5 * src_b21;
222 src_c12 -= src_c4 * src_b22;
223 src_c13 -= src_c5 * src_b22;
224 src_c14 -= src_c4 * src_b23;
225 src_c15 -= src_c5 * src_b23;
227 ST_SP2(src_c4, src_c5, a + 16, 4);
228 ST_SP2(src_c4, src_c5, c_nxt2line, 4);
/* Column 3 solved; eliminate from columns 4..7. */
232 src_c8 -= src_c6 * src_b28;
233 src_c9 -= src_c7 * src_b28;
234 src_c10 -= src_c6 * src_b29;
235 src_c11 -= src_c7 * src_b29;
236 src_c12 -= src_c6 * src_b30;
237 src_c13 -= src_c7 * src_b30;
238 src_c14 -= src_c6 * src_b31;
239 src_c15 -= src_c7 * src_b31;
241 ST_SP2(src_c6, src_c7, a + 24, 4);
242 ST_SP2(src_c6, src_c7, c_nxt3line, 4);
/* Broadcast triangular rows 4..7 (offsets 36..39, 45..47, 54..55, 63). */
244 src_b = LD_SP(b + 36);
245 SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
247 src_b45 = LD_SP(b + 45);
248 src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
249 src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
250 src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
252 src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
253 src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
254 src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
/* Column 4 solved; eliminate from columns 5..7. */
258 src_c10 -= src_c8 * src_b37;
259 src_c11 -= src_c9 * src_b37;
260 src_c12 -= src_c8 * src_b38;
261 src_c13 -= src_c9 * src_b38;
262 src_c14 -= src_c8 * src_b39;
263 src_c15 -= src_c9 * src_b39;
265 ST_SP2(src_c8, src_c9, a + 32, 4);
266 ST_SP2(src_c8, src_c9, c_nxt4line, 4);
/* Column 5 solved; eliminate from columns 6..7. */
270 src_c12 -= src_c10 * src_b46;
271 src_c13 -= src_c11 * src_b46;
272 src_c14 -= src_c10 * src_b47;
273 src_c15 -= src_c11 * src_b47;
275 ST_SP2(src_c10, src_c11, a + 40, 4);
276 ST_SP2(src_c10, src_c11, c_nxt5line, 4);
/* Column 6 solved; eliminate from column 7. */
280 src_c14 -= src_c12 * src_b55;
281 src_c15 -= src_c13 * src_b55;
283 ST_SP2(src_c12, src_c13, a + 48, 4);
284 ST_SP2(src_c12, src_c13, c_nxt6line, 4);
/* Column 7 solved and stored. */
289 ST_SP2(src_c14, src_c15, a + 56, 4);
290 ST_SP2(src_c14, src_c15, c_nxt7line, 4);
/* Solve an 8x4 tile for TRSM "RN": GEMM update over bk (unrolled by 2 plus
 * odd-bk remainder), then forward substitution against the 4x4 triangular
 * block of B (offsets 0..3, 5..7, 10..11, 15 above/on the diagonal).
 * NOTE(review): extraction dropped some lines (braces, diagonal-scale
 * statements); visible code kept byte-identical. */
293 static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
296 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
297 v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
298 v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1;
299 FLOAT *c_nxt1line = c + ldc;
300 FLOAT *c_nxt2line = c + 2 * ldc;
301 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 8x4 C tile, two vectors per line. */
303 LD_SP2(c, 4, src_c0, src_c1);
304 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
305 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
306 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
/* GEMM update unrolled by 2: C -= A(8x1) * B(1x4). */
308 for (k = 0; k < (bk >> 1); k++)
310 LD_SP2(a, 4, src_a0, src_a1);
312 src_b = LD_SP(b + 0);
313 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
314 src_c0 -= src_a0 * src_b0;
315 src_c1 -= src_a1 * src_b0;
316 src_c2 -= src_a0 * src_b1;
317 src_c3 -= src_a1 * src_b1;
318 src_c4 -= src_a0 * src_b2;
319 src_c5 -= src_a1 * src_b2;
320 src_c6 -= src_a0 * src_b3;
321 src_c7 -= src_a1 * src_b3;
/* Second unrolled iteration (pointer-advance lines missing in this view). */
326 LD_SP2(a, 4, src_a0, src_a1);
328 src_b = LD_SP(b + 0);
329 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
330 src_c0 -= src_a0 * src_b0;
331 src_c1 -= src_a1 * src_b0;
332 src_c2 -= src_a0 * src_b1;
333 src_c3 -= src_a1 * src_b1;
334 src_c4 -= src_a0 * src_b2;
335 src_c5 -= src_a1 * src_b2;
336 src_c6 -= src_a0 * src_b3;
337 src_c7 -= src_a1 * src_b3;
/* Remainder iteration for odd bk. */
343 if ((bk & 1) && (bk > 0))
345 LD_SP2(a, 4, src_a0, src_a1);
347 src_b = LD_SP(b + 0);
348 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
349 src_c0 -= src_a0 * src_b0;
350 src_c1 -= src_a1 * src_b0;
351 src_c2 -= src_a0 * src_b1;
352 src_c3 -= src_a1 * src_b1;
353 src_c4 -= src_a0 * src_b2;
354 src_c5 -= src_a1 * src_b2;
355 src_c6 -= src_a0 * src_b3;
356 src_c7 -= src_a1 * src_b3;
/* Broadcast the 4x4 triangular entries of B. */
362 src_b = LD_SP(b + 0);
363 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
364 src_b5 = LD_SP(b + 5);
365 src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
366 src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
367 src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
368 src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
369 src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
370 src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
/* Forward substitution: eliminate each solved column from the later ones. */
374 src_c2 -= src_c0 * src_b1;
375 src_c3 -= src_c1 * src_b1;
376 src_c4 -= src_c0 * src_b2;
377 src_c5 -= src_c1 * src_b2;
378 src_c6 -= src_c0 * src_b3;
379 src_c7 -= src_c1 * src_b3;
383 src_c4 -= src_c2 * src_b6;
384 src_c5 -= src_c3 * src_b6;
385 src_c6 -= src_c2 * src_b7;
386 src_c7 -= src_c3 * src_b7;
390 src_c6 -= src_c4 * src_b11;
391 src_c7 -= src_c5 * src_b11;
/* Store the solved tile to the packed buffer and back to C. */
396 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
397 ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
399 ST_SP2(src_c0, src_c1, c, 4);
400 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
401 ST_SP2(src_c4, src_c5, c_nxt2line, 4);
402 ST_SP2(src_c6, src_c7, c_nxt3line, 4);
/* Solve an 8x2 tile for TRSM "RN": GEMM update (unrolled by 2 plus odd-bk
 * remainder) followed by substitution against the 2x2 triangular block of B
 * (entries at offsets 0, 1, 3). NOTE(review): extraction dropped lines
 * (braces, diagonal-scale statements); visible code kept byte-identical. */
405 static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
408 v4f32 src_a0, src_a1;
409 v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
410 FLOAT *c_nxt1line = c + ldc;
412 LD_SP2(c, 4, src_c0, src_c1);
413 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
/* GEMM update unrolled by 2: C -= A(8x1) * B(1x2). */
415 for (k = 0; k < (bk >> 1); k++)
417 LD_SP2(a, 4, src_a0, src_a1);
419 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
420 src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
422 src_c0 -= src_a0 * src_b0;
423 src_c1 -= src_a1 * src_b0;
424 src_c2 -= src_a0 * src_b1;
425 src_c3 -= src_a1 * src_b1;
/* Second unrolled iteration. */
430 LD_SP2(a, 4, src_a0, src_a1);
432 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
433 src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
435 src_c0 -= src_a0 * src_b0;
436 src_c1 -= src_a1 * src_b0;
437 src_c2 -= src_a0 * src_b1;
438 src_c3 -= src_a1 * src_b1;
/* Remainder iteration for odd bk. */
444 if ((bk & 1) && (bk > 0))
446 LD_SP2(a, 4, src_a0, src_a1);
448 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
449 src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
451 src_c0 -= src_a0 * src_b0;
452 src_c1 -= src_a1 * src_b0;
453 src_c2 -= src_a0 * src_b1;
454 src_c3 -= src_a1 * src_b1;
/* Broadcast the 2x2 triangular entries of B and substitute. */
460 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
461 src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
462 src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
466 src_c2 -= src_c0 * src_b1;
467 src_c3 -= src_c1 * src_b1;
/* Store the solved tile to the packed buffer and back to C. */
471 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
472 ST_SP2(src_c0, src_c1, c, 4);
473 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
/* Solve an 8x1 tile for TRSM "RN": C -= A * b over bk (main loop unrolled
 * by 4, plus a 1/2/3-iteration remainder), then scale by the single
 * triangular entry b[0] (scale statement missing from this extraction).
 * NOTE(review): braces and switch/if structure around the remainder were
 * lost in extraction; visible code kept byte-identical. */
476 static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
479 v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
481 LD_SP2(c, 4, src_c0, src_c1);
/* Main loop, unrolled by 4. */
483 for (k = 0; k < (bk >> 2); k++)
485 LD_SP2(a, 4, src_a0, src_a1);
487 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
489 src_c0 -= src_a0 * src_b0;
490 src_c1 -= src_a1 * src_b0;
495 LD_SP2(a, 4, src_a0, src_a1);
497 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
499 src_c0 -= src_a0 * src_b0;
500 src_c1 -= src_a1 * src_b0;
505 LD_SP2(a, 4, src_a0, src_a1);
507 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
509 src_c0 -= src_a0 * src_b0;
510 src_c1 -= src_a1 * src_b0;
515 LD_SP2(a, 4, src_a0, src_a1);
517 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
519 src_c0 -= src_a0 * src_b0;
520 src_c1 -= src_a1 * src_b0;
/* Remainder: up to three more single iterations. */
526 if ((bk & 3) && (bk > 0))
530 LD_SP2(a, 4, src_a0, src_a1);
532 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
534 src_c0 -= src_a0 * src_b0;
535 src_c1 -= src_a1 * src_b0;
540 LD_SP2(a, 4, src_a0, src_a1);
542 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
544 src_c0 -= src_a0 * src_b0;
545 src_c1 -= src_a1 * src_b0;
553 LD_SP2(a, 4, src_a0, src_a1);
555 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
557 src_c0 -= src_a0 * src_b0;
558 src_c1 -= src_a1 * src_b0;
/* Triangular entry b[0] (the multiply by it is missing from this view). */
565 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
/* Store the solved column to the packed buffer and back to C. */
570 ST_SP2(src_c0, src_c1, a, 4);
571 ST_SP2(src_c0, src_c1, c, 4);
/* Solve a 4x8 tile for TRSM "RN": one vector per C line. GEMM update over
 * bk, then forward substitution against the 8x8 triangular block of B
 * (same offsets as ssolve_8x8_rn_msa: 0..7, 9..15, 18..23, 27..31, 36..39,
 * 45..47, 54..55, 63). NOTE(review): extraction dropped lines (braces,
 * the src_c0 load, diagonal-scale statements); code kept byte-identical. */
574 static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
577 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
578 v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
579 v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
580 v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
581 v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
582 v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0;
583 FLOAT *c_nxt1line = c + ldc;
584 FLOAT *c_nxt2line = c + 2 * ldc;
585 FLOAT *c_nxt3line = c + 3 * ldc;
586 FLOAT *c_nxt4line = c + 4 * ldc;
587 FLOAT *c_nxt5line = c + 5 * ldc;
588 FLOAT *c_nxt6line = c + 6 * ldc;
589 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load the 4x8 C tile (src_c0 load missing from this view). */
592 src_c1 = LD_SP(c_nxt1line);
593 src_c2 = LD_SP(c_nxt2line);
594 src_c3 = LD_SP(c_nxt3line);
595 src_c4 = LD_SP(c_nxt4line);
596 src_c5 = LD_SP(c_nxt5line);
597 src_c6 = LD_SP(c_nxt6line);
598 src_c7 = LD_SP(c_nxt7line);
/* GEMM update: C -= A(4x1) * B(1x8) per iteration. */
600 for (k = 0; k < bk; k++)
604 src_b = LD_SP(b + 0);
605 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
606 src_c0 -= src_a0 * src_b0;
607 src_c1 -= src_a0 * src_b1;
608 src_c2 -= src_a0 * src_b2;
609 src_c3 -= src_a0 * src_b3;
611 src_b = LD_SP(b + 4);
612 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
613 src_c4 -= src_a0 * src_b0;
614 src_c5 -= src_a0 * src_b1;
615 src_c6 -= src_a0 * src_b2;
616 src_c7 -= src_a0 * src_b3;
/* Broadcast all triangular entries of the 8x8 block of B. */
622 src_b = LD_SP(b + 0);
623 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
624 src_b = LD_SP(b + 4);
625 SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
627 src_b = LD_SP(b + 9);
628 SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
629 src_b13 = LD_SP(b + 13);
630 src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
631 src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
632 src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
634 src_b = LD_SP(b + 18);
635 SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
636 src_b22 = LD_SP(b + 22);
637 src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
638 src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
640 src_b = LD_SP(b + 27);
641 SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
642 src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
644 src_b = LD_SP(b + 36);
645 SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
647 src_b45 = LD_SP(b + 45);
648 src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
649 src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
650 src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
652 src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
653 src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
654 src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
/* Forward substitution, one solved column eliminated at a time
 * (diagonal-scale lines missing from this view). */
657 src_c1 -= src_c0 * src_b1;
658 src_c2 -= src_c0 * src_b2;
659 src_c3 -= src_c0 * src_b3;
660 src_c4 -= src_c0 * src_b4;
661 src_c5 -= src_c0 * src_b5;
662 src_c6 -= src_c0 * src_b6;
663 src_c7 -= src_c0 * src_b7;
666 src_c2 -= src_c1 * src_b10;
667 src_c3 -= src_c1 * src_b11;
668 src_c4 -= src_c1 * src_b12;
669 src_c5 -= src_c1 * src_b13;
670 src_c6 -= src_c1 * src_b14;
671 src_c7 -= src_c1 * src_b15;
674 src_c3 -= src_c2 * src_b19;
675 src_c4 -= src_c2 * src_b20;
676 src_c5 -= src_c2 * src_b21;
677 src_c6 -= src_c2 * src_b22;
678 src_c7 -= src_c2 * src_b23;
681 src_c4 -= src_c3 * src_b28;
682 src_c5 -= src_c3 * src_b29;
683 src_c6 -= src_c3 * src_b30;
684 src_c7 -= src_c3 * src_b31;
687 src_c5 -= src_c4 * src_b37;
688 src_c6 -= src_c4 * src_b38;
689 src_c7 -= src_c4 * src_b39;
692 src_c6 -= src_c5 * src_b46;
693 src_c7 -= src_c5 * src_b47;
696 src_c7 -= src_c6 * src_b55;
/* Store the solved tile to the packed buffer and back to C. */
700 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
701 ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
704 ST_SP(src_c1, c_nxt1line);
705 ST_SP(src_c2, c_nxt2line);
706 ST_SP(src_c3, c_nxt3line);
707 ST_SP(src_c4, c_nxt4line);
708 ST_SP(src_c5, c_nxt5line);
709 ST_SP(src_c6, c_nxt6line);
710 ST_SP(src_c7, c_nxt7line);
/* Solve a 4x4 tile for TRSM "RN": GEMM update (unrolled by 2 plus odd-bk
 * remainder), then substitution against the 4x4 triangular block of B.
 * NOTE(review): extraction dropped lines (braces, src_c0/src_a0 loads,
 * diagonal-scale statements); visible code kept byte-identical. */
713 static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
716 v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3;
717 v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0;
718 FLOAT *c_nxt1line = c + ldc;
719 FLOAT *c_nxt2line = c + 2 * ldc;
720 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 4x4 C tile (src_c0 load missing from this view). */
723 src_c1 = LD_SP(c_nxt1line);
724 src_c2 = LD_SP(c_nxt2line);
725 src_c3 = LD_SP(c_nxt3line);
/* GEMM update unrolled by 2: C -= A(4x1) * B(1x4). */
727 for (k = 0; k < (bk >> 1); k++)
731 src_b = LD_SP(b + 0);
732 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
733 src_c0 -= src_a0 * src_b0;
734 src_c1 -= src_a0 * src_b1;
735 src_c2 -= src_a0 * src_b2;
736 src_c3 -= src_a0 * src_b3;
/* Second unrolled iteration. */
743 src_b = LD_SP(b + 0);
744 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
745 src_c0 -= src_a0 * src_b0;
746 src_c1 -= src_a0 * src_b1;
747 src_c2 -= src_a0 * src_b2;
748 src_c3 -= src_a0 * src_b3;
/* Remainder iteration for odd bk. */
754 if ((bk & 1) && (bk > 0))
758 src_b = LD_SP(b + 0);
759 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
760 src_c0 -= src_a0 * src_b0;
761 src_c1 -= src_a0 * src_b1;
762 src_c2 -= src_a0 * src_b2;
763 src_c3 -= src_a0 * src_b3;
/* Broadcast the 4x4 triangular entries of B (offsets 0..3, 5..7, 10..11, 15). */
769 src_b = LD_SP(b + 0);
770 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
771 src_b5 = LD_SP(b + 5);
772 src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
773 src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
774 src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
775 src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
776 src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
777 src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
/* Forward substitution. */
780 src_c1 -= src_c0 * src_b1;
781 src_c2 -= src_c0 * src_b2;
782 src_c3 -= src_c0 * src_b3;
785 src_c2 -= src_c1 * src_b6;
786 src_c3 -= src_c1 * src_b7;
789 src_c3 -= src_c2 * src_b11;
/* Store the solved tile to the packed buffer and back to C. */
793 ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
796 ST_SP(src_c1, c_nxt1line);
797 ST_SP(src_c2, c_nxt2line);
798 ST_SP(src_c3, c_nxt3line);
/* Solve a 4x2 tile for TRSM "RN": GEMM update (unrolled by 4 plus up-to-3
 * remainder iterations), then substitution against the 2x2 triangular block
 * of B (offsets 0, 1, 3). NOTE(review): extraction dropped lines (braces,
 * src_a/src_b0 loads, diagonal-scale statements); code kept byte-identical. */
801 static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
804 v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3;
805 FLOAT *c_nxt1line = c + ldc;
/* Load the 4x2 C tile (src_c0 load missing from this view). */
808 src_c1 = LD_SP(c_nxt1line);
/* GEMM update unrolled by 4: C -= A(4x1) * B(1x2); b0/b1 are split out of
 * one vector load via splats. */
810 for (k = 0; k < (bk >> 2); k++)
814 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
815 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
817 src_c0 -= src_a * src_b0;
818 src_c1 -= src_a * src_b1;
825 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
826 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
828 src_c0 -= src_a * src_b0;
829 src_c1 -= src_a * src_b1;
836 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
837 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
839 src_c0 -= src_a * src_b0;
840 src_c1 -= src_a * src_b1;
847 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
848 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
850 src_c0 -= src_a * src_b0;
851 src_c1 -= src_a * src_b1;
/* Remainder: up to three more single iterations. */
857 if ((bk & 3) && (bk > 0))
863 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
864 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
866 src_c0 -= src_a * src_b0;
867 src_c1 -= src_a * src_b1;
874 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
875 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
877 src_c0 -= src_a * src_b0;
878 src_c1 -= src_a * src_b1;
888 src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
889 src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
891 src_c0 -= src_a * src_b0;
892 src_c1 -= src_a * src_b1;
/* Broadcast the 2x2 triangular entries of B and substitute. */
899 src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
900 src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
901 src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
904 src_c1 -= src_c0 * src_b1;
/* Store the solved tile to the packed buffer and back to C. */
907 ST_SP2(src_c0, src_c1, a, 4);
910 ST_SP(src_c1, c_nxt1line);
/* Solve a 4x1 tile for TRSM "RN" using scalar FLOATs.
 * NOTE(review): the body of this function (the bk loop's statements, the
 * solve, and the stores) was lost in extraction — only the signature,
 * declarations, and loop header remain; kept byte-identical. */
913 static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
916 FLOAT b0, c0, c1, c2, c3;
923 for (k = 0; k < bk; k++)
/* Solve a 2x8 tile for TRSM "RN" with scalar FLOATs: load the two elements
 * of each of the 8 C lines, do the bk-step GEMM update, then forward
 * substitution against the 8x8 triangular block of B, and write back to
 * both the packed buffer `a` and matrix `c`. NOTE(review): extraction
 * dropped lines (braces, b-element loads b0..b63, diagonal-scale
 * statements, early `a` stores); visible code kept byte-identical. */
952 static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
955 FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
956 FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31;
957 FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63;
958 FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
959 FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
960 FLOAT c0_nxt7, c1_nxt7;
/* Load the 2x8 C tile, two scalars per line. */
964 c0_nxt1 = *(c + 0 + 1 * ldc);
965 c1_nxt1 = *(c + 1 + 1 * ldc);
966 c0_nxt2 = *(c + 0 + 2 * ldc);
967 c1_nxt2 = *(c + 1 + 2 * ldc);
968 c0_nxt3 = *(c + 0 + 3 * ldc);
969 c1_nxt3 = *(c + 1 + 3 * ldc);
970 c0_nxt4 = *(c + 0 + 4 * ldc);
971 c1_nxt4 = *(c + 1 + 4 * ldc);
972 c0_nxt5 = *(c + 0 + 5 * ldc);
973 c1_nxt5 = *(c + 1 + 5 * ldc);
974 c0_nxt6 = *(c + 0 + 6 * ldc);
975 c1_nxt6 = *(c + 1 + 6 * ldc);
976 c0_nxt7 = *(c + 0 + 7 * ldc);
977 c1_nxt7 = *(c + 1 + 7 * ldc);
/* GEMM update: C -= a[0..1] * b[0..7] per iteration. */
979 for (k = 0; k < bk; k++)
983 c0_nxt1 -= a[0] * b[1];
984 c1_nxt1 -= a[1] * b[1];
985 c0_nxt2 -= a[0] * b[2];
986 c1_nxt2 -= a[1] * b[2];
987 c0_nxt3 -= a[0] * b[3];
988 c1_nxt3 -= a[1] * b[3];
989 c0_nxt4 -= a[0] * b[4];
990 c1_nxt4 -= a[1] * b[4];
991 c0_nxt5 -= a[0] * b[5];
992 c1_nxt5 -= a[1] * b[5];
993 c0_nxt6 -= a[0] * b[6];
994 c1_nxt6 -= a[1] * b[6];
995 c0_nxt7 -= a[0] * b[7];
996 c1_nxt7 -= a[1] * b[7];
/* Forward substitution: eliminate each solved column from later columns
 * (b-element loads and diagonal scaling are missing from this view). */
1066 c0_nxt2 -= c0_nxt1 * b10;
1067 c1_nxt2 -= c1_nxt1 * b10;
1069 c0_nxt3 -= c0_nxt1 * b11;
1070 c1_nxt3 -= c1_nxt1 * b11;
1072 c0_nxt4 -= c0_nxt1 * b12;
1073 c1_nxt4 -= c1_nxt1 * b12;
1075 c0_nxt5 -= c0_nxt1 * b13;
1076 c1_nxt5 -= c1_nxt1 * b13;
1078 c0_nxt6 -= c0_nxt1 * b14;
1079 c1_nxt6 -= c1_nxt1 * b14;
1081 c0_nxt7 -= c0_nxt1 * b15;
1082 c1_nxt7 -= c1_nxt1 * b15;
1087 c0_nxt3 -= c0_nxt2 * b19;
1088 c1_nxt3 -= c1_nxt2 * b19;
1090 c0_nxt4 -= c0_nxt2 * b20;
1091 c1_nxt4 -= c1_nxt2 * b20;
1093 c0_nxt5 -= c0_nxt2 * b21;
1094 c1_nxt5 -= c1_nxt2 * b21;
1096 c0_nxt6 -= c0_nxt2 * b22;
1097 c1_nxt6 -= c1_nxt2 * b22;
1099 c0_nxt7 -= c0_nxt2 * b23;
1100 c1_nxt7 -= c1_nxt2 * b23;
1105 c0_nxt4 -= c0_nxt3 * b28;
1106 c1_nxt4 -= c1_nxt3 * b28;
1108 c0_nxt5 -= c0_nxt3 * b29;
1109 c1_nxt5 -= c1_nxt3 * b29;
1111 c0_nxt6 -= c0_nxt3 * b30;
1112 c1_nxt6 -= c1_nxt3 * b30;
1114 c0_nxt7 -= c0_nxt3 * b31;
1115 c1_nxt7 -= c1_nxt3 * b31;
1120 c0_nxt5 -= c0_nxt4 * b37;
1121 c1_nxt5 -= c1_nxt4 * b37;
1123 c0_nxt6 -= c0_nxt4 * b38;
1124 c1_nxt6 -= c1_nxt4 * b38;
1126 c0_nxt7 -= c0_nxt4 * b39;
1127 c1_nxt7 -= c1_nxt4 * b39;
1132 c0_nxt6 -= c0_nxt5 * b46;
1133 c1_nxt6 -= c1_nxt5 * b46;
1135 c0_nxt7 -= c0_nxt5 * b47;
1136 c1_nxt7 -= c1_nxt5 * b47;
1141 c0_nxt7 -= c0_nxt6 * b55;
1142 c1_nxt7 -= c1_nxt6 * b55;
/* Store the solved tile to the packed buffer (earlier stores for
 * a[0..9] are missing from this view). */
1157 *(a + 10) = c0_nxt5;
1158 *(a + 11) = c1_nxt5;
1159 *(a + 12) = c0_nxt6;
1160 *(a + 13) = c1_nxt6;
1161 *(a + 14) = c0_nxt7;
1162 *(a + 15) = c1_nxt7;
/* Write the solved tile back to C. */
1166 *(c + 0 + 1 * ldc) = c0_nxt1;
1167 *(c + 1 + 1 * ldc) = c1_nxt1;
1168 *(c + 0 + 2 * ldc) = c0_nxt2;
1169 *(c + 1 + 2 * ldc) = c1_nxt2;
1170 *(c + 0 + 3 * ldc) = c0_nxt3;
1171 *(c + 1 + 3 * ldc) = c1_nxt3;
1172 *(c + 0 + 4 * ldc) = c0_nxt4;
1173 *(c + 1 + 4 * ldc) = c1_nxt4;
1174 *(c + 0 + 5 * ldc) = c0_nxt5;
1175 *(c + 1 + 5 * ldc) = c1_nxt5;
1176 *(c + 0 + 6 * ldc) = c0_nxt6;
1177 *(c + 1 + 6 * ldc) = c1_nxt6;
1178 *(c + 0 + 7 * ldc) = c0_nxt7;
1179 *(c + 1 + 7 * ldc) = c1_nxt7;
/* Solve a 2x4 tile for TRSM "RN" with scalar FLOATs: GEMM update over bk,
 * then forward substitution against the 4x4 triangular block of B.
 * NOTE(review): extraction dropped lines (braces, b-element loads,
 * diagonal-scale statements, packed-`a` stores and the column-0 C store);
 * visible code kept byte-identical. */
1182 static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1185 FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1;
1186 FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3;
/* Load the 2x4 C tile (c0/c1 loads missing from this view). */
1190 c0_nxt1 = *(c + 0 + 1 * ldc);
1191 c1_nxt1 = *(c + 1 + 1 * ldc);
1192 c0_nxt2 = *(c + 0 + 2 * ldc);
1193 c1_nxt2 = *(c + 1 + 2 * ldc);
1194 c0_nxt3 = *(c + 0 + 3 * ldc);
1195 c1_nxt3 = *(c + 1 + 3 * ldc);
/* GEMM update: C -= a[0..1] * b[0..3] per iteration. */
1197 for (k = 0; k < bk; k++)
1201 c0_nxt1 -= a[0] * b[1];
1202 c1_nxt1 -= a[1] * b[1];
1203 c0_nxt2 -= a[0] * b[2];
1204 c1_nxt2 -= a[1] * b[2];
1205 c0_nxt3 -= a[0] * b[3];
1206 c1_nxt3 -= a[1] * b[3];
/* Forward substitution (b-element loads missing from this view). */
1233 c0_nxt2 -= c0_nxt1 * b6;
1234 c1_nxt2 -= c1_nxt1 * b6;
1240 c0_nxt3 -= c0_nxt1 * b7;
1241 c1_nxt3 -= c1_nxt1 * b7;
1242 c0_nxt3 -= c0_nxt2 * b11;
1243 c1_nxt3 -= c1_nxt2 * b11;
/* Write the solved tile back to C. */
1258 *(c + 1 * ldc) = c0_nxt1;
1259 *(c + 1 + 1 * ldc) = c1_nxt1;
1260 *(c + 2 * ldc) = c0_nxt2;
1261 *(c + 1 + 2 * ldc) = c1_nxt2;
1262 *(c + 3 * ldc) = c0_nxt3;
1263 *(c + 1 + 3 * ldc) = c1_nxt3;
/* Solve a 2x2 tile for TRSM "RN" with scalar FLOATs: GEMM update over bk,
 * then substitution against the 2x2 triangular block of B.
 * NOTE(review): extraction dropped lines (braces, c0/c1 loads, b-element
 * loads, solve statements, first-column stores); code kept byte-identical. */
1266 static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1269 FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt;
/* Load second column of the 2x2 C tile (first-column loads missing). */
1273 c0_nxt = *(c + 0 + ldc);
1274 c1_nxt = *(c + 1 + ldc);
/* GEMM update: C -= a[0..1] * b[0..1] per iteration. */
1276 for (k = 0; k < bk; k++)
1280 c0_nxt -= a[0] * b[1];
1281 c1_nxt -= a[1] * b[1];
/* Write the solved second column back to C. */
1307 *(c + ldc) = c0_nxt;
1308 *(c + 1 + ldc) = c1_nxt;
/* Solve a 2x1 tile for TRSM "RN".
 * NOTE(review): everything but the signature and the bk loop header was
 * lost in extraction; remaining lines kept byte-identical. */
1311 static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1319 for (k = 0; k < bk; k++)
/* Solve a 1x8 tile for TRSM "RN" with scalar FLOATs: load one element per
 * C line, GEMM-update over bk, substitute against the 8x8 triangular block
 * of B, and write back. NOTE(review): the loop body, the substitution
 * statements, and the c0 load/store were lost in extraction; remaining
 * lines kept byte-identical. */
1340 static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1343 FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
1344 FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38;
1345 FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7;
/* Load one element from each of the 8 C lines (c0 load missing). */
1348 c1 = *(c + 1 * ldc);
1349 c2 = *(c + 2 * ldc);
1350 c3 = *(c + 3 * ldc);
1351 c4 = *(c + 4 * ldc);
1352 c5 = *(c + 5 * ldc);
1353 c6 = *(c + 6 * ldc);
1354 c7 = *(c + 7 * ldc);
1356 for (k = 0; k < bk; k++)
/* Write the solved elements back to C (c0 store missing). */
1462 *(c + 1 * ldc) = c1;
1463 *(c + 2 * ldc) = c2;
1464 *(c + 3 * ldc) = c3;
1465 *(c + 4 * ldc) = c4;
1466 *(c + 5 * ldc) = c5;
1467 *(c + 6 * ldc) = c6;
1468 *(c + 7 * ldc) = c7;
/* Solve a 1x4 tile for TRSM "RN" with scalar FLOATs.
 * NOTE(review): the loop body, substitution statements, and the c0
 * load/store were lost in extraction; remaining lines kept byte-identical. */
1471 static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1474 FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
/* Load one element from each of the 4 C lines (c0 load missing). */
1477 c1 = *(c + 1 * ldc);
1478 c2 = *(c + 2 * ldc);
1479 c3 = *(c + 3 * ldc);
1481 for (k = 0; k < bk; k++)
/* Write the solved elements back to C (c0 store missing). */
1523 *(c + 1 * ldc) = c1;
1524 *(c + 2 * ldc) = c2;
1525 *(c + 3 * ldc) = c3;
/* Solve a 1x2 tile for TRSM "RN".
 * NOTE(review): only the signature, declarations, and loop header survived
 * extraction; remaining lines kept byte-identical. */
1528 static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1531 FLOAT b0, b1, b3, c0, c1;
1536 for (k = 0; k < bk; k++)
/* Solve a 1x1 tile for TRSM "RN". Note: unlike its siblings, this variant
 * takes no ldc parameter (single element, no row stride needed).
 * NOTE(review): the body was lost in extraction; kept byte-identical. */
1561 static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
1565 for (k = 0; k < bk; k++)
/* TRSM "RN" driver: tiles the m x n problem into 8/4/2/1-wide column
 * panels (outer loops over n>>3, then n&4, n&2, n&1 — headers partly
 * missing in this view) and 8/4/2/1-tall row strips, dispatching each
 * tile to the matching ssolve_MxN_rn_msa kernel with the running solved
 * depth kk. NOTE(review): extraction dropped the loop braces, pointer
 * advances (aa/cc/b updates), kk bookkeeping, and the trailing return;
 * visible code kept byte-identical. */
1577 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
1578 FLOAT *c, BLASLONG ldc, BLASLONG offset)
/* 8-wide column panels. */
1585 for (j = (n >> 3); j--;)
1590 for (i = (m >> 3); i--;)
1592 ssolve_8x8_rn_msa(aa, b, cc, ldc, kk);
1602 ssolve_4x8_rn_msa(aa, b, cc, ldc, kk);
1610 ssolve_2x8_rn_msa(aa, b, cc, ldc, kk);
1618 ssolve_1x8_rn_msa(aa, b, cc, ldc, kk);
/* 4-wide column panel (condition line missing in this view). */
1637 for (i = (m >> 3); i--;)
1639 ssolve_8x4_rn_msa(aa, b, cc, ldc, kk);
1649 ssolve_4x4_rn_msa(aa, b, cc, ldc, kk);
1657 ssolve_2x4_rn_msa(aa, b, cc, ldc, kk);
1665 ssolve_1x4_rn_msa(aa, b, cc, ldc, kk);
/* 2-wide column panel. */
1682 for (i = (m >> 3); i--;)
1684 ssolve_8x2_rn_msa(aa, b, cc, ldc, kk);
1694 ssolve_4x2_rn_msa(aa, b, cc, ldc, kk);
1702 ssolve_2x2_rn_msa(aa, b, cc, ldc, kk);
1710 ssolve_1x2_rn_msa(aa, b, cc, ldc, kk);
/* 1-wide column panel. */
1727 for (i = (m >> 3); i--;)
1729 ssolve_8x1_rn_msa(aa, b, cc, ldc, kk);
1739 ssolve_4x1_rn_msa(aa, b, cc, ldc, kk);
1747 ssolve_2x1_rn_msa(aa, b, cc, ldc, kk);
1755 ssolve_1x1_rn_msa(aa, b, cc, kk);