/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
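
/* MIPS MSA micro-kernels for the single precision "LT" TRSM path.  Each
 * ssolve_MxN_lt_msa() routine handles an MxN block of C: it first subtracts
 * the contribution of the bk already-solved rows of the panel (a GEMM-style
 * update using the packed A and B buffers), then forward-substitutes with
 * the packed MxM triangular block of A.  The diagonal entries of that block
 * are expected to hold reciprocals, as prepared by the TRSM packing
 * routines, so the solve uses only multiplications.  The solved block is
 * written back to both the packed B buffer and to C. */

/* 8x8 block: the tile of C lives in sixteen v4f32 accumulators, two vectors
 * per column of the tile; the update loop is unrolled by two and prefetches
 * ahead in the packed A panel. */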
static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
    v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
    v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
    v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
    v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
    v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
    LD_SP2(c_nxt4line, 4, src_c8, src_c9);
    LD_SP2(c_nxt5line, 4, src_c10, src_c11);
    LD_SP2(c_nxt6line, 4, src_c12, src_c13);
    LD_SP2(c_nxt7line, 4, src_c14, src_c15);

    BLASLONG k, pref_offset;
    v4f32 src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;

    pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);

    pref_offset = L1_DATA_LINESIZE - pref_offset;
    pref_offset = pref_offset / sizeof(FLOAT);

    pa0_pref = a + pref_offset;

    for (k = 0; k < (bk >> 1); k++)
        PREF_OFFSET(pa0_pref, 64);
        PREF_OFFSET(pa0_pref, 96);

        LD_SP2_INC(a, 4, src_a0, src_a1);
        LD_SP2_INC(b, 4, src_bb0, src_bb1);

        SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
        src_c8 -= src_a0 * src_b0;
        src_c9 -= src_a1 * src_b0;
        src_c10 -= src_a0 * src_b1;
        src_c11 -= src_a1 * src_b1;
        src_c12 -= src_a0 * src_b2;
        src_c13 -= src_a1 * src_b2;
        src_c14 -= src_a0 * src_b3;
        src_c15 -= src_a1 * src_b3;

        LD_SP2_INC(a, 4, src_a0, src_a1);
        LD_SP2_INC(b, 4, src_bb0, src_bb1);

        SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
        src_c8 -= src_a0 * src_b0;
        src_c9 -= src_a1 * src_b0;
        src_c10 -= src_a0 * src_b1;
        src_c11 -= src_a1 * src_b1;
        src_c12 -= src_a0 * src_b2;
        src_c13 -= src_a1 * src_b2;
        src_c14 -= src_a0 * src_b3;
        src_c15 -= src_a1 * src_b3;
    LD_SP2_INC(a, 4, src_a0, src_a1);
    LD_SP2_INC(b, 4, src_bb0, src_bb1);

    SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
    src_c0 -= src_a0 * src_b0;
    src_c1 -= src_a1 * src_b0;
    src_c2 -= src_a0 * src_b1;
    src_c3 -= src_a1 * src_b1;
    src_c4 -= src_a0 * src_b2;
    src_c5 -= src_a1 * src_b2;
    src_c6 -= src_a0 * src_b3;
    src_c7 -= src_a1 * src_b3;

    SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
    src_c8 -= src_a0 * src_b0;
    src_c9 -= src_a1 * src_b0;
    src_c10 -= src_a0 * src_b1;
    src_c11 -= src_a1 * src_b1;
    src_c12 -= src_a0 * src_b2;
    src_c13 -= src_a1 * src_b2;
    src_c14 -= src_a0 * src_b3;
    src_c15 -= src_a1 * src_b3;

    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
                       res_c8, res_c9, res_c10, res_c11);
    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);
    TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
                       res_c12, res_c13, res_c14, res_c15);

    src_a = LD_SP(a + 0);
    SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
    src_a = LD_SP(a + 4);
    SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);

    res_c1 -= res_c0 * src_a1;
    res_c9 -= res_c8 * src_a1;
    res_c2 -= res_c0 * src_a2;
    res_c10 -= res_c8 * src_a2;
    res_c3 -= res_c0 * src_a3;
    res_c11 -= res_c8 * src_a3;
    res_c4 -= res_c0 * src_a4;
    res_c12 -= res_c8 * src_a4;
    res_c5 -= res_c0 * src_a5;
    res_c13 -= res_c8 * src_a5;
    res_c6 -= res_c0 * src_a6;
    res_c14 -= res_c8 * src_a6;
    res_c7 -= res_c0 * src_a7;
    res_c15 -= res_c8 * src_a7;

    src_a = LD_SP(a + 9);
    SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
    src_a13 = LD_SP(a + 13);
    src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
    src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
    src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);

    res_c2 -= res_c1 * src_a10;
    res_c10 -= res_c9 * src_a10;
    res_c3 -= res_c1 * src_a11;
    res_c11 -= res_c9 * src_a11;
    res_c4 -= res_c1 * src_a12;
    res_c12 -= res_c9 * src_a12;
    res_c5 -= res_c1 * src_a13;
    res_c13 -= res_c9 * src_a13;
    res_c6 -= res_c1 * src_a14;
    res_c14 -= res_c9 * src_a14;
    res_c7 -= res_c1 * src_a15;
    res_c15 -= res_c9 * src_a15;
    src_a = LD_SP(a + 18);
    SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
    src_a22 = LD_SP(a + 22);
    src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
    src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);

    res_c3 -= res_c2 * src_a19;
    res_c11 -= res_c10 * src_a19;
    res_c4 -= res_c2 * src_a20;
    res_c12 -= res_c10 * src_a20;
    res_c5 -= res_c2 * src_a21;
    res_c13 -= res_c10 * src_a21;
    res_c6 -= res_c2 * src_a22;
    res_c14 -= res_c10 * src_a22;
    res_c7 -= res_c2 * src_a23;
    res_c15 -= res_c10 * src_a23;

    src_a = LD_SP(a + 27);
    SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
    src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));

    res_c4 -= res_c3 * src_a28;
    res_c12 -= res_c11 * src_a28;
    res_c5 -= res_c3 * src_a29;
    res_c13 -= res_c11 * src_a29;
    res_c6 -= res_c3 * src_a30;
    res_c14 -= res_c11 * src_a30;
    res_c7 -= res_c3 * src_a31;
    res_c15 -= res_c11 * src_a31;

    ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
    ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
                       src_c8, src_c10, src_c12, src_c14);

    ST_SP(src_c2, c_nxt1line);
    ST_SP(src_c4, c_nxt2line);
    ST_SP(src_c6, c_nxt3line);
    ST_SP(src_c8, c_nxt4line);
    ST_SP(src_c10, c_nxt5line);
    ST_SP(src_c12, c_nxt6line);
    ST_SP(src_c14, c_nxt7line);

    src_a = LD_SP(a + 36);
    SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);

    res_c5 -= res_c4 * src_a37;
    res_c13 -= res_c12 * src_a37;
    res_c6 -= res_c4 * src_a38;
    res_c14 -= res_c12 * src_a38;
    res_c7 -= res_c4 * src_a39;
    res_c15 -= res_c12 * src_a39;

    src_a45 = LD_SP(a + 45);
    src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
    src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);

    res_c6 -= res_c5 * src_a46;
    res_c14 -= res_c13 * src_a46;
    res_c7 -= res_c5 * src_a47;
    res_c15 -= res_c13 * src_a47;

    src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
    src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
    src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));

    res_c7 -= res_c6 * src_a55;
    res_c15 -= res_c14 * src_a55;

    ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
    ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);

    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);
    TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
                       src_c9, src_c11, src_c13, src_c15);

    ST_SP(src_c1, c + 4);
    ST_SP(src_c3, c_nxt1line + 4);
    ST_SP(src_c5, c_nxt2line + 4);
    ST_SP(src_c7, c_nxt3line + 4);
    ST_SP(src_c9, c_nxt4line + 4);
    ST_SP(src_c11, c_nxt5line + 4);
    ST_SP(src_c13, c_nxt6line + 4);
    ST_SP(src_c15, c_nxt7line + 4);
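
/* 8x4 block: same scheme as the 8x8 kernel, with four columns of C and a
 * non-unrolled update loop. */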
static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
    v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
    v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
    v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
    v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);

    for (k = 0; k < bk; k++)
        LD_SP2(a, 4, src_a0, src_a1);

        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    src_a = LD_SP(a + 0);
    SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
    src_a = LD_SP(a + 4);
    SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);

    res_c1 -= res_c0 * src_a1;
    res_c2 -= res_c0 * src_a2;
    res_c3 -= res_c0 * src_a3;
    res_c4 -= res_c0 * src_a4;
    res_c5 -= res_c0 * src_a5;
    res_c6 -= res_c0 * src_a6;
    res_c7 -= res_c0 * src_a7;

    src_a = LD_SP(a + 9);
    SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
    src_a13 = LD_SP(a + 13);
    src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
    src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
    src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);

    res_c2 -= res_c1 * src_a10;
    res_c3 -= res_c1 * src_a11;
    res_c4 -= res_c1 * src_a12;
    res_c5 -= res_c1 * src_a13;
    res_c6 -= res_c1 * src_a14;
    res_c7 -= res_c1 * src_a15;

    src_a = LD_SP(a + 18);
    SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
    src_a22 = LD_SP(a + 22);
    src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
    src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);

    res_c3 -= res_c2 * src_a19;
    res_c4 -= res_c2 * src_a20;
    res_c5 -= res_c2 * src_a21;
    res_c6 -= res_c2 * src_a22;
    res_c7 -= res_c2 * src_a23;

    src_a = LD_SP(a + 27);
    SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
    src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));

    res_c4 -= res_c3 * src_a28;
    res_c5 -= res_c3 * src_a29;
    res_c6 -= res_c3 * src_a30;
    res_c7 -= res_c3 * src_a31;

    src_a = LD_SP(a + 36);
    SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);

    res_c5 -= res_c4 * src_a37;
    res_c6 -= res_c4 * src_a38;
    res_c7 -= res_c4 * src_a39;

    src_a45 = LD_SP(a + 45);
    src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
    src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);

    res_c6 -= res_c5 * src_a46;
    res_c7 -= res_c5 * src_a47;

    src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
    src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
    src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));

    res_c7 -= res_c6 * src_a55;

    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
    ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);

    ST_SP2(src_c0, src_c1, c, 4);
    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
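
/* 8x2 block: scalar variant; the two columns of the C tile are kept in
 * c0..c7 and c0_nxt..c7_nxt and solved against the same 8x8 triangular
 * block of A. */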
static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
    FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
    FLOAT a45, a46, a47, a54, a55, a63;
    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
    FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;

    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);
    c2_nxt = *(c + 2 + ldc);
    c3_nxt = *(c + 3 + ldc);
    c4_nxt = *(c + 4 + ldc);
    c5_nxt = *(c + 5 + ldc);
    c6_nxt = *(c + 6 + ldc);
    c7_nxt = *(c + 7 + ldc);

    for (k = 0; k < bk; k++)
        c0_nxt -= a[0] * b[1];
        c1_nxt -= a[1] * b[1];
        c2_nxt -= a[2] * b[1];
        c3_nxt -= a[3] * b[1];
        c4_nxt -= a[4] * b[1];
        c5_nxt -= a[5] * b[1];
        c6_nxt -= a[6] * b[1];
        c7_nxt -= a[7] * b[1];

    c1_nxt -= c0_nxt * a1;

    c2_nxt -= c0_nxt * a2;
    c2_nxt -= c1_nxt * a10;

    c3_nxt -= c0_nxt * a3;
    c3_nxt -= c1_nxt * a11;
    c3_nxt -= c2_nxt * a19;

    c4_nxt -= c0_nxt * a4;
    c4_nxt -= c1_nxt * a12;
    c4_nxt -= c2_nxt * a20;
    c4_nxt -= c3_nxt * a28;

    c5_nxt -= c0_nxt * a5;
    c5_nxt -= c1_nxt * a13;
    c5_nxt -= c2_nxt * a21;
    c5_nxt -= c3_nxt * a29;
    c5_nxt -= c4_nxt * a37;

    c6_nxt -= c0_nxt * a6;
    c6_nxt -= c1_nxt * a14;
    c6_nxt -= c2_nxt * a22;
    c6_nxt -= c3_nxt * a30;
    c6_nxt -= c4_nxt * a38;
    c6_nxt -= c5_nxt * a46;

    c7_nxt -= c0_nxt * a7;
    c7_nxt -= c1_nxt * a15;
    c7_nxt -= c2_nxt * a23;
    c7_nxt -= c3_nxt * a31;
    c7_nxt -= c4_nxt * a39;
    c7_nxt -= c5_nxt * a47;
    c7_nxt -= c6_nxt * a55;

    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
    *(c + 2 + ldc) = c2_nxt;
    *(c + 3 + ldc) = c3_nxt;
    *(c + 4 + ldc) = c4_nxt;
    *(c + 5 + ldc) = c5_nxt;
    *(c + 6 + ldc) = c6_nxt;
    *(c + 7 + ldc) = c7_nxt;
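
/* 8x1 block: scalar variant with a single column of C. */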
static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
    FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
    FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
    FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;

    for (k = 0; k < bk; k++)
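
/* 4x8 block: one v4f32 per column of the C tile; after the update the tile
 * is transposed so that each res_c vector holds one row, which is the layout
 * the forward substitution with the 4x4 block of A works on. */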
static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
    v4f32 src_a10, src_a11, src_a15, src_a;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);
    src_c4 = LD_SP(c_nxt4line);
    src_c5 = LD_SP(c_nxt5line);
    src_c6 = LD_SP(c_nxt6line);
    src_c7 = LD_SP(c_nxt7line);

    for (k = 0; k < (bk >> 1); k++)
        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(b + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c4 -= src_a0 * src_b0;
        src_c5 -= src_a0 * src_b1;
        src_c6 -= src_a0 * src_b2;
        src_c7 -= src_a0 * src_b3;

        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(b + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c4 -= src_a0 * src_b0;
        src_c5 -= src_a0 * src_b1;
        src_c6 -= src_a0 * src_b2;
        src_c7 -= src_a0 * src_b3;

    if ((bk & 1) && (bk > 0))
        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(b + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c4 -= src_a0 * src_b0;
        src_c5 -= src_a0 * src_b1;
        src_c6 -= src_a0 * src_b2;
        src_c7 -= src_a0 * src_b3;

    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    src_a = LD_SP(a + 0);
    SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
    src_a5 = LD_SP(a + 5);
    src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
    src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
    src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
    src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
    src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
    src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));

    res_c1 -= res_c0 * src_a1;
    res_c5 -= res_c4 * src_a1;
    res_c2 -= res_c0 * src_a2;
    res_c6 -= res_c4 * src_a2;
    res_c3 -= res_c0 * src_a3;
    res_c7 -= res_c4 * src_a3;

    res_c2 -= res_c1 * src_a6;
    res_c6 -= res_c5 * src_a6;
    res_c3 -= res_c1 * src_a7;
    res_c7 -= res_c5 * src_a7;

    res_c3 -= res_c2 * src_a11;
    res_c7 -= res_c6 * src_a11;

    ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
    ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c4, src_c5, src_c6, src_c7);

    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
    ST_SP(src_c4, c_nxt4line);
    ST_SP(src_c5, c_nxt5line);
    ST_SP(src_c6, c_nxt6line);
    ST_SP(src_c7, c_nxt7line);
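
/* 4x4 block: same approach as the 4x8 kernel with four columns of C. */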
static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
    v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
    v4f32 src_a10, src_a11, src_a15, src_a;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);

    for (k = 0; k < (bk >> 1); k++)
        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

    if ((bk & 1) && (bk > 0))
        src_b = LD_SP(b + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);

    src_a = LD_SP(a + 0);
    SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
    src_a5 = LD_SP(a + 5);
    src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
    src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
    src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
    src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
    src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
    src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));

    res_c1 -= res_c0 * src_a1;
    res_c2 -= res_c0 * src_a2;
    res_c3 -= res_c0 * src_a3;

    res_c2 -= res_c1 * src_a6;
    res_c3 -= res_c1 * src_a7;

    res_c3 -= res_c2 * src_a11;

    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);

    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
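
/* 4x2 block: scalar variant, two columns of four elements. */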
static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt;
    FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15;

    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);
    c2_nxt = *(c + 2 + ldc);
    c3_nxt = *(c + 3 + ldc);

    for (k = 0; k < bk; k++)
        c0_nxt -= a[0] * b[1];
        c1_nxt -= a[1] * b[1];
        c2_nxt -= a[2] * b[1];
        c3_nxt -= a[3] * b[1];

    c1_nxt -= c0_nxt * a1;

    c2_nxt -= c0_nxt * a2;
    c2_nxt -= c1_nxt * a6;

    c3_nxt -= c0_nxt * a3;
    c3_nxt -= c1_nxt * a7;
    c3_nxt -= c2_nxt * a11;

    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
    *(c + 2 + ldc) = c2_nxt;
    *(c + 3 + ldc) = c3_nxt;
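
/* 4x1 block: scalar variant, a single column of four elements. */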
static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
    FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;

    for (k = 0; k < bk; k++)
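
/* 2x8 block: scalar variant; eight columns of two elements, solved with the
 * 2x2 triangular block of A (a0 and a3 hold the pre-inverted diagonal, a1
 * the off-diagonal term). */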
static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2;
    FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5;
    FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7;

    c0_nxt1 = *(c + ldc);
    c1_nxt1 = *(c + 1 + ldc);
    c0_nxt2 = *(c + 2 * ldc);
    c1_nxt2 = *(c + 1 + 2 * ldc);
    c0_nxt3 = *(c + 3 * ldc);
    c1_nxt3 = *(c + 1 + 3 * ldc);
    c0_nxt4 = *(c + 4 * ldc);
    c1_nxt4 = *(c + 1 + 4 * ldc);
    c0_nxt5 = *(c + 5 * ldc);
    c1_nxt5 = *(c + 1 + 5 * ldc);
    c0_nxt6 = *(c + 6 * ldc);
    c1_nxt6 = *(c + 1 + 6 * ldc);
    c0_nxt7 = *(c + 7 * ldc);
    c1_nxt7 = *(c + 1 + 7 * ldc);

    for (k = 0; k < bk; k++)
        c0_nxt1 -= a[0] * b[1];
        c1_nxt1 -= a[1] * b[1];
        c0_nxt2 -= a[0] * b[2];
        c1_nxt2 -= a[1] * b[2];
        c0_nxt3 -= a[0] * b[3];
        c1_nxt3 -= a[1] * b[3];
        c0_nxt4 -= a[0] * b[4];
        c1_nxt4 -= a[1] * b[4];
        c0_nxt5 -= a[0] * b[5];
        c1_nxt5 -= a[1] * b[5];
        c0_nxt6 -= a[0] * b[6];
        c1_nxt6 -= a[1] * b[6];
        c0_nxt7 -= a[0] * b[7];
        c1_nxt7 -= a[1] * b[7];

    c1 = (c1 - c0 * a1) * a3;

    c0_nxt1 = c0_nxt1 * a0;
    c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3;

    c0_nxt2 = c0_nxt2 * a0;
    c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3;

    c0_nxt3 = c0_nxt3 * a0;
    c1_nxt3 = (c1_nxt3 - c0_nxt3 * a1) * a3;

    c0_nxt4 = c0_nxt4 * a0;
    c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3;

    c0_nxt5 = c0_nxt5 * a0;
    c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3;

    c0_nxt6 = c0_nxt6 * a0;
    c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3;

    c0_nxt7 = c0_nxt7 * a0;
    c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3;

    *(b + 10) = c1_nxt2;
    *(b + 11) = c1_nxt3;
    *(b + 12) = c1_nxt4;
    *(b + 13) = c1_nxt5;
    *(b + 14) = c1_nxt6;
    *(b + 15) = c1_nxt7;

    *(c + 0 + ldc) = c0_nxt1;
    *(c + 1 + ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
    *(c + 0 + 4 * ldc) = c0_nxt4;
    *(c + 1 + 4 * ldc) = c1_nxt4;
    *(c + 0 + 5 * ldc) = c0_nxt5;
    *(c + 1 + 5 * ldc) = c1_nxt5;
    *(c + 0 + 6 * ldc) = c0_nxt6;
    *(c + 1 + 6 * ldc) = c1_nxt6;
    *(c + 0 + 7 * ldc) = c0_nxt7;
    *(c + 1 + 7 * ldc) = c1_nxt7;
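
/* 2x4 block: scalar variant, four columns of two elements. */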
static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
    FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;

    c0_nxt1 = *(c + ldc);
    c1_nxt1 = *(c + 1 + ldc);
    c0_nxt2 = *(c + 2 * ldc);
    c1_nxt2 = *(c + 1 + 2 * ldc);
    c0_nxt3 = *(c + 3 * ldc);
    c1_nxt3 = *(c + 1 + 3 * ldc);

    for (k = 0; k < bk; k++)
        c0_nxt1 -= a[0] * b[1];
        c1_nxt1 -= a[1] * b[1];
        c0_nxt2 -= a[0] * b[2];
        c1_nxt2 -= a[1] * b[2];
        c0_nxt3 -= a[0] * b[3];
        c1_nxt3 -= a[1] * b[3];

    c1_nxt1 -= c0_nxt1 * a1;
    c1_nxt2 -= c0_nxt2 * a1;
    c1_nxt3 -= c0_nxt3 * a1;

    *(c + 0 + ldc) = c0_nxt1;
    *(c + 1 + ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
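
/* 2x2 block: scalar variant, two columns of two elements. */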
static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;

    c0_nxt = *(c + ldc);
    c1_nxt = *(c + 1 + ldc);

    for (k = 0; k < bk; k++)
        c0_nxt -= a[0] * b[1];
        c1_nxt -= a[1] * b[1];

    c1_nxt -= c0_nxt * a1;

    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
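
/* 2x1 block: scalar variant, a single column of two elements. */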
static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
    for (k = 0; k < bk; k++)

    c1 -= c0 * *(a + 1);
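
/* 1xN blocks: a single row of C, so the solve reduces to scaling each
 * right-hand side by the (pre-inverted) diagonal element of A. */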
static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;

    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);
    c4 = *(c + 4 * ldc);
    c5 = *(c + 5 * ldc);
    c6 = *(c + 6 * ldc);
    c7 = *(c + 7 * ldc);

    for (k = 0; k < bk; k++)

    *(c + 1 * ldc) = c1;
    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;
    *(c + 4 * ldc) = c4;
    *(c + 5 * ldc) = c5;
    *(c + 6 * ldc) = c6;
    *(c + 7 * ldc) = c7;
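
/* 1x4 variant of the single-row solve. */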
static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    FLOAT c0, c1, c2, c3;

    c0 = *(c + 0 * ldc);
    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);

    for (k = 0; k < bk; k++)

    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;

    *(b + 1) = *(c + ldc);
    *(b + 2) = *(c + 2 * ldc);
    *(b + 3) = *(c + 3 * ldc);
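
/* 1x2 variant of the single-row solve. */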
static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
    for (k = 0; k < bk; k++)

    *(c + ldc) = c1 * *a;

    *(b + 1) = *(c + ldc);
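
/* 1x1 variant: one element, one scaling. */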
static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
    for (k = 0; k < bk; k++)
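
/* Driver: tiles n in strips of 8/4/2/1 columns and, within each strip, m in
 * blocks of 8/4/2/1 rows, dispatching to the matching ssolve_MxN_lt_msa()
 * kernel; kk, derived from offset, supplies each block's update length bk. */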
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
          FLOAT *c, BLASLONG ldc, BLASLONG offset)
    for (j = (n >> 3); j--;)
        for (i = (m >> 3); i--;)
            ssolve_8x8_lt_msa(aa, b, cc, ldc, kk);

        ssolve_4x8_lt_msa(aa, b, cc, ldc, kk);

        ssolve_2x8_lt_msa(aa, b, cc, ldc, kk);

        ssolve_1x8_lt_msa(aa, b, cc, ldc, kk);

    for (i = (m >> 3); i--;)
        ssolve_8x4_lt_msa(aa, b, cc, ldc, kk);

    ssolve_4x4_lt_msa(aa, b, cc, ldc, kk);

    ssolve_2x4_lt_msa(aa, b, cc, ldc, kk);

    ssolve_1x4_lt_msa(aa, b, cc, ldc, kk);

    for (i = (m >> 3); i--;)
        ssolve_8x2_lt_msa(aa, b, cc, ldc, kk);

    ssolve_4x2_lt_msa(aa, b, cc, ldc, kk);

    ssolve_2x2_lt_msa(aa, b, cc, ldc, kk);

    ssolve_1x2_lt_msa(aa, b, cc, ldc, kk);

    for (i = (m >> 3); i--;)
        ssolve_8x1_lt_msa(aa, b, cc, kk);

    ssolve_4x1_lt_msa(aa, b, cc, kk);

    ssolve_2x1_lt_msa(aa, b, cc, kk);

    ssolve_1x1_lt_msa(aa, b, cc, kk);