1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/* 8x8 "LN" triangular-solve micro-kernel (MIPS MSA vector intrinsics).
 * Phase 1: applies the deferred rank-bk GEMM update C -= A_panel * B_panel
 * (loop over bk, unrolled by 2, with an odd-k tail).
 * Phase 2: solves the 8x8 triangular system held in the packed block at
 * `a` by backward substitution (eliminating component 7 down to 0),
 * writing the solved values both into the packed `b` panel and back into
 * the 8 C lines (spaced `ldc` floats apart).
 * NOTE(review): this listing appears to have lines elided (braces, loop
 * guards, and the diagonal-scaling statements are not visible); comments
 * below describe only the code that is visible. */
31 static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
33 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
34 v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
35 v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
36 v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
37 v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
38 v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36;
39 v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
40 v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
41 v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
/* Pointers to the 8 lines of the C tile, each `ldc` floats apart. */
42 FLOAT *c_nxt1line = c + ldc;
43 FLOAT *c_nxt2line = c + 2 * ldc;
44 FLOAT *c_nxt3line = c + 3 * ldc;
45 FLOAT *c_nxt4line = c + 4 * ldc;
46 FLOAT *c_nxt5line = c + 5 * ldc;
47 FLOAT *c_nxt6line = c + 6 * ldc;
48 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load the 8x8 C tile: two 4-float vectors (8 floats) from each line. */
50 LD_SP2(c, 4, src_c0, src_c1);
51 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
52 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
53 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
54 LD_SP2(c_nxt4line, 4, src_c8, src_c9);
55 LD_SP2(c_nxt5line, 4, src_c10, src_c11);
56 LD_SP2(c_nxt6line, 4, src_c12, src_c13);
57 LD_SP2(c_nxt7line, 4, src_c14, src_c15);
61 BLASLONG k, pref_offset;
62 FLOAT *aa = a, *bb = b, *pa0_pref;
63 v4f32 src_a1, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;
/* Align the software-prefetch pointer to the next L1 cache line. */
65 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
69 pref_offset = L1_DATA_LINESIZE - pref_offset;
70 pref_offset = pref_offset / sizeof(FLOAT);
73 pa0_pref = a + pref_offset;
/* GEMM update, unrolled by 2: for each k, C(8x8) -= A_col(8) * B_row(8). */
75 for (k = 0; k < (bk >> 1); k++)
77 PREF_OFFSET(pa0_pref, 64);
78 PREF_OFFSET(pa0_pref, 96);
80 LD_SP2_INC(aa, 4, src_a0, src_a1);
81 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
/* Broadcast b[0..3] and multiply-subtract into the first 4 C lines. */
83 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
84 src_c0 -= src_a0 * src_b0;
85 src_c1 -= src_a1 * src_b0;
86 src_c2 -= src_a0 * src_b1;
87 src_c3 -= src_a1 * src_b1;
88 src_c4 -= src_a0 * src_b2;
89 src_c5 -= src_a1 * src_b2;
90 src_c6 -= src_a0 * src_b3;
91 src_c7 -= src_a1 * src_b3;
/* Broadcast b[4..7] for the remaining 4 C lines. */
93 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
94 src_c8 -= src_a0 * src_b0;
95 src_c9 -= src_a1 * src_b0;
96 src_c10 -= src_a0 * src_b1;
97 src_c11 -= src_a1 * src_b1;
98 src_c12 -= src_a0 * src_b2;
99 src_c13 -= src_a1 * src_b2;
100 src_c14 -= src_a0 * src_b3;
101 src_c15 -= src_a1 * src_b3;
/* Second unrolled k iteration — identical structure. */
103 LD_SP2_INC(aa, 4, src_a0, src_a1);
104 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
106 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
107 src_c0 -= src_a0 * src_b0;
108 src_c1 -= src_a1 * src_b0;
109 src_c2 -= src_a0 * src_b1;
110 src_c3 -= src_a1 * src_b1;
111 src_c4 -= src_a0 * src_b2;
112 src_c5 -= src_a1 * src_b2;
113 src_c6 -= src_a0 * src_b3;
114 src_c7 -= src_a1 * src_b3;
116 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
117 src_c8 -= src_a0 * src_b0;
118 src_c9 -= src_a1 * src_b0;
119 src_c10 -= src_a0 * src_b1;
120 src_c11 -= src_a1 * src_b1;
121 src_c12 -= src_a0 * src_b2;
122 src_c13 -= src_a1 * src_b2;
123 src_c14 -= src_a0 * src_b3;
124 src_c15 -= src_a1 * src_b3;
/* Tail iteration for odd bk — the `if (bk & 1)` guard is presumably on
 * an elided line above; verify against the full source. */
131 LD_SP2_INC(aa, 4, src_a0, src_a1);
132 LD_SP2_INC(bb, 4, src_bb0, src_bb1);
134 SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
135 src_c0 -= src_a0 * src_b0;
136 src_c1 -= src_a1 * src_b0;
137 src_c2 -= src_a0 * src_b1;
138 src_c3 -= src_a1 * src_b1;
139 src_c4 -= src_a0 * src_b2;
140 src_c5 -= src_a1 * src_b2;
141 src_c6 -= src_a0 * src_b3;
142 src_c7 -= src_a1 * src_b3;
144 SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
145 src_c8 -= src_a0 * src_b0;
146 src_c9 -= src_a1 * src_b0;
147 src_c10 -= src_a0 * src_b1;
148 src_c11 -= src_a1 * src_b1;
149 src_c12 -= src_a0 * src_b2;
150 src_c13 -= src_a1 * src_b2;
151 src_c14 -= src_a1 * src_b3;
/* NOTE(review): line above transcribed from the elided original as
 * `src_c14 -= src_a0 * src_b3;` in the visible listing — kept verbatim. */
152 src_c15 -= src_a1 * src_b3;
/* Transpose the C tile so each res_c vector holds one row of the 8x8
 * block (needed because the solve works row-wise on packed A). */
159 TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
160 res_c4, res_c5, res_c6, res_c7);
161 TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
162 res_c12, res_c13, res_c14, res_c15);
163 TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
164 res_c0, res_c1, res_c2, res_c3);
165 TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
166 res_c8, res_c9, res_c10, res_c11);
/* Broadcast a[56..63] (last row of the packed 8x8 A block). */
168 src_a = LD_SP(a + 60);
169 SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
170 src_a = LD_SP(a + 56);
171 SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);
/* Backward substitution: eliminate component 7 from rows 0..6.
 * The diagonal scaling (res_c7 *= a63 etc.) appears to be on elided
 * lines — confirm against the full source. */
175 res_c6 -= res_c7 * src_a62;
176 res_c14 -= res_c15 * src_a62;
177 res_c5 -= res_c7 * src_a61;
178 res_c13 -= res_c15 * src_a61;
179 res_c4 -= res_c7 * src_a60;
180 res_c12 -= res_c15 * src_a60;
181 res_c3 -= res_c7 * src_a59;
182 res_c11 -= res_c15 * src_a59;
183 res_c2 -= res_c7 * src_a58;
184 res_c10 -= res_c15 * src_a58;
185 res_c1 -= res_c7 * src_a57;
186 res_c9 -= res_c15 * src_a57;
187 res_c0 -= res_c7 * src_a56;
188 res_c8 -= res_c15 * src_a56;
/* Eliminate component 6 using a[48..54]. */
190 src_a = LD_SP(a + 48);
191 SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
192 src_a52 = LD_SP(a + 52);
193 src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
194 src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
195 src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);
199 res_c5 -= res_c6 * src_a53;
200 res_c13 -= res_c14 * src_a53;
201 res_c4 -= res_c6 * src_a52;
202 res_c12 -= res_c14 * src_a52;
203 res_c3 -= res_c6 * src_a51;
204 res_c11 -= res_c14 * src_a51;
205 res_c2 -= res_c6 * src_a50;
206 res_c10 -= res_c14 * src_a50;
207 res_c1 -= res_c6 * src_a49;
208 res_c9 -= res_c14 * src_a49;
209 res_c0 -= res_c6 * src_a48;
210 res_c8 -= res_c14 * src_a48;
/* Eliminate component 5 using a[40..45]. */
212 src_a = LD_SP(a + 40);
213 SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
214 src_a44 = LD_SP(a + 44);
215 src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
216 src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);
220 res_c4 -= res_c5 * src_a44;
221 res_c12 -= res_c13 * src_a44;
222 res_c3 -= res_c5 * src_a43;
223 res_c11 -= res_c13 * src_a43;
224 res_c2 -= res_c5 * src_a42;
225 res_c10 -= res_c13 * src_a42;
226 res_c1 -= res_c5 * src_a41;
227 res_c9 -= res_c13 * src_a41;
228 res_c0 -= res_c5 * src_a40;
229 res_c8 -= res_c13 * src_a40;
/* Eliminate component 4 using a[32..36]. */
231 src_a = LD_SP(a + 32);
232 SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
233 src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
237 res_c3 -= res_c4 * src_a35;
238 res_c11 -= res_c12 * src_a35;
239 res_c2 -= res_c4 * src_a34;
240 res_c10 -= res_c12 * src_a34;
241 res_c1 -= res_c4 * src_a33;
242 res_c9 -= res_c12 * src_a33;
243 res_c0 -= res_c4 * src_a32;
244 res_c8 -= res_c12 * src_a32;
/* Store solved rows 4..7 into the packed b panel. */
246 ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
247 ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);
/* Transpose back and store the high half (elements 4..7) of each C line. */
249 TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
250 src_c1, src_c3, src_c5, src_c7);
251 TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
252 src_c9, src_c11, src_c13, src_c15);
254 ST_SP(src_c1, c + 4);
255 ST_SP(src_c3, c_nxt1line + 4);
256 ST_SP(src_c5, c_nxt2line + 4);
257 ST_SP(src_c7, c_nxt3line + 4);
258 ST_SP(src_c9, c_nxt4line + 4);
259 ST_SP(src_c11, c_nxt5line + 4);
260 ST_SP(src_c13, c_nxt6line + 4);
261 ST_SP(src_c15, c_nxt7line + 4);
/* Eliminate component 3 using a[24..27]. */
263 src_a = LD_SP(a + 24);
264 SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);
268 res_c2 -= res_c3 * src_a26;
269 res_c10 -= res_c11 * src_a26;
270 res_c1 -= res_c3 * src_a25;
271 res_c9 -= res_c11 * src_a25;
272 res_c0 -= res_c3 * src_a24;
273 res_c8 -= res_c11 * src_a24;
/* Eliminate component 2 using a[16..18]. */
275 src_a16 = LD_SP(a + 16);
276 src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
277 src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
278 src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);
282 res_c1 -= res_c2 * src_a17;
283 res_c9 -= res_c10 * src_a17;
284 res_c0 -= res_c2 * src_a16;
285 res_c8 -= res_c10 * src_a16;
/* Final eliminations with a[8..9]; a0 is the (0,0) diagonal entry —
 * its use (scaling row 0) is on elided lines. */
287 src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
288 src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
289 src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
293 res_c0 -= res_c1 * src_a8;
294 res_c8 -= res_c9 * src_a8;
/* Store solved rows 0..3 to b, then the low half of each C line. */
299 ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
300 ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);
302 TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
303 src_c0, src_c2, src_c4, src_c6);
304 TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
305 src_c8, src_c10, src_c12, src_c14);
308 ST_SP(src_c2, c_nxt1line);
309 ST_SP(src_c4, c_nxt2line);
310 ST_SP(src_c6, c_nxt3line);
311 ST_SP(src_c8, c_nxt4line);
312 ST_SP(src_c10, c_nxt5line);
313 ST_SP(src_c12, c_nxt6line);
314 ST_SP(src_c14, c_nxt7line);
/* 8x4 "LN" triangular-solve micro-kernel (MSA).  Same structure as the
 * 8x8 variant but for 4 C lines: GEMM update C -= A*B over bk steps,
 * then backward substitution on the packed 8x8 A block, storing results
 * to the packed b panel and the 4 C lines.
 * NOTE(review): listing has elided lines (braces, diagonal scalings);
 * comments describe only visible code. */
317 static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
320 FLOAT *aa = a, *bb = b;
321 v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
322 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
323 v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
324 v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
325 v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35;
326 v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
327 v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
328 v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
329 FLOAT *c_nxt1line = c + ldc;
330 FLOAT *c_nxt2line = c + 2 * ldc;
331 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 8x4 C tile: 8 floats from each of the 4 C lines. */
333 LD_SP2(c, 4, src_c0, src_c1);
334 LD_SP2(c_nxt1line, 4, src_c2, src_c3);
335 LD_SP2(c_nxt2line, 4, src_c4, src_c5);
336 LD_SP2(c_nxt3line, 4, src_c6, src_c7);
/* GEMM update, unrolled by 2: C -= A_col(8) * B_row(4) per k step. */
338 for (k = 0; k < (bk >> 1); k++)
340 LD_SP2(aa, 4, src_a0, src_a1);
342 src_b = LD_SP(bb + 0);
343 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
344 src_c0 -= src_a0 * src_b0;
345 src_c1 -= src_a1 * src_b0;
346 src_c2 -= src_a0 * src_b1;
347 src_c3 -= src_a1 * src_b1;
348 src_c4 -= src_a0 * src_b2;
349 src_c5 -= src_a1 * src_b2;
350 src_c6 -= src_a0 * src_b3;
351 src_c7 -= src_a1 * src_b3;
/* Second unrolled iteration (pointer advances presumably elided). */
356 LD_SP2(aa, 4, src_a0, src_a1);
358 src_b = LD_SP(bb + 0);
359 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
360 src_c0 -= src_a0 * src_b0;
361 src_c1 -= src_a1 * src_b0;
362 src_c2 -= src_a0 * src_b1;
363 src_c3 -= src_a1 * src_b1;
364 src_c4 -= src_a0 * src_b2;
365 src_c5 -= src_a1 * src_b2;
366 src_c6 -= src_a0 * src_b3;
367 src_c7 -= src_a1 * src_b3;
/* Tail iteration for odd bk (the `bk > 0` term is redundant given
 * `bk & 1`, but harmless — kept as-is). */
373 if ((bk & 1) && (bk > 0))
375 LD_SP2(aa, 4, src_a0, src_a1);
377 src_b = LD_SP(bb + 0);
378 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
379 src_c0 -= src_a0 * src_b0;
380 src_c1 -= src_a1 * src_b0;
381 src_c2 -= src_a0 * src_b1;
382 src_c3 -= src_a1 * src_b1;
383 src_c4 -= src_a0 * src_b2;
384 src_c5 -= src_a1 * src_b2;
385 src_c6 -= src_a0 * src_b3;
386 src_c7 -= src_a1 * src_b3;
/* Transpose so each res_c vector holds one row of the tile. */
392 TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
393 res_c0, res_c1, res_c2, res_c3);
394 TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
395 res_c4, res_c5, res_c6, res_c7);
/* Broadcast A block rows for the backward substitution. */
397 src_a = LD_SP(a + 60);
398 SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
399 src_a = LD_SP(a + 56);
400 SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);
402 src_a = LD_SP(a + 48);
403 SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
404 src_a52 = LD_SP(a + 52);
405 src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
406 src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
407 src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);
/* Eliminate component 7 from rows 0..6 (diagonal scaling elided). */
410 res_c6 -= res_c7 * src_a62;
411 res_c5 -= res_c7 * src_a61;
412 res_c4 -= res_c7 * src_a60;
413 res_c3 -= res_c7 * src_a59;
414 res_c2 -= res_c7 * src_a58;
415 res_c1 -= res_c7 * src_a57;
416 res_c0 -= res_c7 * src_a56;
/* Eliminate component 6. */
419 res_c5 -= res_c6 * src_a53;
420 res_c4 -= res_c6 * src_a52;
421 res_c3 -= res_c6 * src_a51;
422 res_c2 -= res_c6 * src_a50;
423 res_c1 -= res_c6 * src_a49;
424 res_c0 -= res_c6 * src_a48;
/* Eliminate component 5 using a[40..45]. */
426 src_a = LD_SP(a + 40);
427 SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
428 src_a44 = LD_SP(a + 44);
429 src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
430 src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);
433 res_c4 -= res_c5 * src_a44;
434 res_c3 -= res_c5 * src_a43;
435 res_c2 -= res_c5 * src_a42;
436 res_c1 -= res_c5 * src_a41;
437 res_c0 -= res_c5 * src_a40;
/* Eliminate component 4 using a[32..36]. */
439 src_a = LD_SP(a + 32);
440 SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
441 src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
444 res_c3 -= res_c4 * src_a35;
445 res_c2 -= res_c4 * src_a34;
446 res_c1 -= res_c4 * src_a33;
447 res_c0 -= res_c4 * src_a32;
/* Eliminate component 3 using a[24..27]. */
449 src_a = LD_SP(a + 24);
450 SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);
453 res_c2 -= res_c3 * src_a26;
454 res_c1 -= res_c3 * src_a25;
455 res_c0 -= res_c3 * src_a24;
/* Eliminate component 2 using a[16..18]. */
457 src_a16 = LD_SP(a + 16);
458 src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
459 src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
460 src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);
463 res_c1 -= res_c2 * src_a17;
464 res_c0 -= res_c2 * src_a16;
/* Final eliminations; a0 is the (0,0) diagonal entry (use elided). */
466 src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
467 src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
468 src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
471 res_c0 -= res_c1 * src_a8;
/* Store solved rows to the packed b panel, then back to the C lines. */
475 ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
476 ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4);
478 TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
479 src_c0, src_c2, src_c4, src_c6);
480 TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
481 src_c1, src_c3, src_c5, src_c7);
483 ST_SP2(src_c0, src_c1, c, 4);
484 ST_SP2(src_c2, src_c3, c_nxt1line, 4);
485 ST_SP2(src_c4, src_c5, c_nxt2line, 4);
486 ST_SP2(src_c6, src_c7, c_nxt3line, 4);
/* 8x2 "LN" solver, scalar version: GEMM update over bk steps for two C
 * columns (c and c+ldc), then backward substitution using the packed
 * 8x8 A block (indices 63 down to 0).
 * NOTE(review): listing has elided lines — the reads of c0..c7, the
 * first column's update in the k loop, the a-element loads, the plain
 * (non-_nxt) substitution statements, and the stores of c0..c7 are not
 * visible; comments cover only what is shown. */
489 static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
492 FLOAT *aa = a, *bb = b;
493 FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
494 FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
495 FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
496 FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
497 FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;
/* Load the second C column (ldc floats after the first). */
507 c0_nxt = *(c + 0 + ldc);
508 c1_nxt = *(c + 1 + ldc);
509 c2_nxt = *(c + 2 + ldc);
510 c3_nxt = *(c + 3 + ldc);
511 c4_nxt = *(c + 4 + ldc);
512 c5_nxt = *(c + 5 + ldc);
513 c6_nxt = *(c + 6 + ldc);
514 c7_nxt = *(c + 7 + ldc);
/* GEMM update: each k subtracts a-column * b-row from both C columns. */
516 for (k = 0; k < bk; k++)
526 c0_nxt -= aa[0] * bb[1];
527 c1_nxt -= aa[1] * bb[1];
528 c2_nxt -= aa[2] * bb[1];
529 c3_nxt -= aa[3] * bb[1];
530 c4_nxt -= aa[4] * bb[1];
531 c5_nxt -= aa[5] * bb[1];
532 c6_nxt -= aa[6] * bb[1];
533 c7_nxt -= aa[7] * bb[1];
/* Backward substitution on the second column: eliminate component 7
 * down to 1 (diagonal scalings and first-column statements elided). */
582 c6_nxt -= c7_nxt * a62;
584 c5_nxt -= c7_nxt * a61;
586 c4_nxt -= c7_nxt * a60;
588 c3_nxt -= c7_nxt * a59;
590 c2_nxt -= c7_nxt * a58;
592 c1_nxt -= c7_nxt * a57;
594 c0_nxt -= c7_nxt * a56;
599 c5_nxt -= c6_nxt * a53;
601 c4_nxt -= c6_nxt * a52;
603 c3_nxt -= c6_nxt * a51;
605 c2_nxt -= c6_nxt * a50;
607 c1_nxt -= c6_nxt * a49;
609 c0_nxt -= c6_nxt * a48;
614 c4_nxt -= c5_nxt * a44;
616 c3_nxt -= c5_nxt * a43;
618 c2_nxt -= c5_nxt * a42;
620 c1_nxt -= c5_nxt * a41;
622 c0_nxt -= c5_nxt * a40;
627 c3_nxt -= c4_nxt * a35;
629 c2_nxt -= c4_nxt * a34;
631 c1_nxt -= c4_nxt * a33;
633 c0_nxt -= c4_nxt * a32;
638 c2_nxt -= c3_nxt * a26;
640 c1_nxt -= c3_nxt * a25;
642 c0_nxt -= c3_nxt * a24;
647 c1_nxt -= c2_nxt * a17;
649 c0_nxt -= c2_nxt * a16;
654 c0_nxt -= c1_nxt * a8;
/* Store the solved second column back to C. */
684 *(c + 0 + ldc) = c0_nxt;
685 *(c + 1 + ldc) = c1_nxt;
686 *(c + 2 + ldc) = c2_nxt;
687 *(c + 3 + ldc) = c3_nxt;
688 *(c + 4 + ldc) = c4_nxt;
689 *(c + 5 + ldc) = c5_nxt;
690 *(c + 6 + ldc) = c6_nxt;
691 *(c + 7 + ldc) = c7_nxt;
/* 8x1 "LN" solver, scalar version: one C column.
 * NOTE(review): almost the entire body (C loads, k-loop update,
 * substitution, stores) is elided from this listing; only the
 * declarations and the loop header are visible. */
694 static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
697 FLOAT *aa = a, *bb = b;
698 FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
699 FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
700 FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
701 FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
/* GEMM update loop (body elided). */
712 for (k = 0; k < bk; k++)
/* 4x8 "LN" solver (MSA): GEMM update of a 4x8 C tile over bk steps,
 * then backward substitution on the packed 4x4 A block (indices 15
 * down to 0), writing results to the packed b panel and the 8 C lines.
 * NOTE(review): listing has elided lines (braces, a-load in the k loop,
 * diagonal scalings); comments describe only visible code. */
830 static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
833 FLOAT *aa = a, *bb = b;
834 v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
835 v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
836 v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
837 v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
838 v4f32 src_a13, src_a14, src_a15;
839 FLOAT *c_nxt1line = c + ldc;
840 FLOAT *c_nxt2line = c + 2 * ldc;
841 FLOAT *c_nxt3line = c + 3 * ldc;
842 FLOAT *c_nxt4line = c + 4 * ldc;
843 FLOAT *c_nxt5line = c + 5 * ldc;
844 FLOAT *c_nxt6line = c + 6 * ldc;
845 FLOAT *c_nxt7line = c + 7 * ldc;
/* Load 4 floats from each of the 8 C lines (src_c0 load elided). */
848 src_c1 = LD_SP(c_nxt1line);
849 src_c2 = LD_SP(c_nxt2line);
850 src_c3 = LD_SP(c_nxt3line);
851 src_c4 = LD_SP(c_nxt4line);
852 src_c5 = LD_SP(c_nxt5line);
853 src_c6 = LD_SP(c_nxt6line);
854 src_c7 = LD_SP(c_nxt7line);
/* GEMM update: C -= A_col(4) * B_row(8) per k step. */
856 for (k = 0; k < bk; k++)
861 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
862 src_c0 -= src_a0 * src_b0;
863 src_c1 -= src_a0 * src_b1;
864 src_c2 -= src_a0 * src_b2;
865 src_c3 -= src_a0 * src_b3;
867 src_b = LD_SP(bb + 4);
868 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
869 src_c4 -= src_a0 * src_b0;
870 src_c5 -= src_a0 * src_b1;
871 src_c6 -= src_a0 * src_b2;
872 src_c7 -= src_a0 * src_b3;
/* Transpose so each res_c vector holds one row of the 4x8 tile. */
881 TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
882 res_c0, res_c1, res_c2, res_c3);
883 TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
884 res_c4, res_c5, res_c6, res_c7);
/* Broadcast the packed 4x4 A block elements for the substitution. */
886 src_a = LD_SP(a + 12);
887 SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
888 src_a8 = LD_SP(a + 8);
889 src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
890 src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
891 src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
893 src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
894 src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
895 src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
/* Eliminate component 3 from rows 0..2 (diagonal scaling elided). */
899 res_c2 -= res_c3 * src_a14;
900 res_c6 -= res_c7 * src_a14;
901 res_c1 -= res_c3 * src_a13;
902 res_c5 -= res_c7 * src_a13;
903 res_c0 -= res_c3 * src_a12;
904 res_c4 -= res_c7 * src_a12;
/* Eliminate component 2. */
908 res_c1 -= res_c2 * src_a9;
909 res_c5 -= res_c6 * src_a9;
910 res_c0 -= res_c2 * src_a8;
911 res_c4 -= res_c6 * src_a8;
/* Eliminate component 1. */
915 res_c0 -= res_c1 * src_a4;
916 res_c4 -= res_c5 * src_a4;
/* Store solved rows to b (packed) and back to the 8 C lines. */
921 ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
922 ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);
924 TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
925 src_c0, src_c1, src_c2, src_c3);
926 TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
927 src_c4, src_c5, src_c6, src_c7);
930 ST_SP(src_c1, c_nxt1line);
931 ST_SP(src_c2, c_nxt2line);
932 ST_SP(src_c3, c_nxt3line);
933 ST_SP(src_c4, c_nxt4line);
934 ST_SP(src_c5, c_nxt5line);
935 ST_SP(src_c6, c_nxt6line);
936 ST_SP(src_c7, c_nxt7line);
/* 4x4 "LN" solver (MSA): GEMM update of a 4x4 C tile (loop unrolled by
 * 2 with an odd-bk tail), then backward substitution on the packed 4x4
 * A block, storing to the b panel and the 4 C lines.
 * NOTE(review): listing has elided lines (braces, a/b loads inside the
 * loops, src_c0 load, diagonal scalings); comments cover visible code. */
939 static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
942 FLOAT *aa = a, *bb = b;
943 v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
944 v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
945 v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
946 v4f32 src_a13, src_a14, src_a15;
947 FLOAT *c_nxt1line = c + ldc;
948 FLOAT *c_nxt2line = c + 2 * ldc;
949 FLOAT *c_nxt3line = c + 3 * ldc;
952 src_c1 = LD_SP(c_nxt1line);
953 src_c2 = LD_SP(c_nxt2line);
954 src_c3 = LD_SP(c_nxt3line);
/* GEMM update, unrolled by 2. */
956 for (k = 0; k < (bk >> 1); k++)
961 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
962 src_c0 -= src_a0 * src_b0;
963 src_c1 -= src_a0 * src_b1;
964 src_c2 -= src_a0 * src_b2;
965 src_c3 -= src_a0 * src_b3;
973 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
974 src_c0 -= src_a0 * src_b0;
975 src_c1 -= src_a0 * src_b1;
976 src_c2 -= src_a0 * src_b2;
977 src_c3 -= src_a0 * src_b3;
/* Tail iteration for odd bk (`bk > 0` is redundant but harmless). */
983 if ((bk & 1) && (bk > 0))
988 SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
989 src_c0 -= src_a0 * src_b0;
990 src_c1 -= src_a0 * src_b1;
991 src_c2 -= src_a0 * src_b2;
992 src_c3 -= src_a0 * src_b3;
/* Transpose so each res_c vector holds one row of the tile. */
998 TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
999 res_c0, res_c1, res_c2, res_c3);
/* Broadcast packed A elements; a0 is the (0,0) diagonal entry. */
1001 src_a = LD_SP(a + 12);
1002 SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
1003 src_a8 = LD_SP(a + 8);
1004 src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
1005 src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
1006 src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
1007 src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
1008 src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
1009 src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
/* Backward substitution (diagonal scalings elided). */
1012 res_c2 -= res_c3 * src_a14;
1013 res_c1 -= res_c3 * src_a13;
1014 res_c0 -= res_c3 * src_a12;
1017 res_c1 -= res_c2 * src_a9;
1018 res_c0 -= res_c2 * src_a8;
1021 res_c0 -= res_c1 * src_a4;
/* Store solved rows to b (packed) and back to the C lines. */
1025 ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
1027 TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
1028 src_c0, src_c1, src_c2, src_c3);
1031 ST_SP(src_c1, c_nxt1line);
1032 ST_SP(src_c2, c_nxt2line);
1033 ST_SP(src_c3, c_nxt3line);
/* 4x2 "LN" solver, scalar version: two C columns, GEMM update over bk
 * steps, then backward substitution with the packed 4x4 A block.
 * NOTE(review): listing has elided lines (c0..c3 loads, a-element
 * loads, diagonal scalings, first-column stores). */
1036 static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1039 FLOAT *aa = a, *bb = b;
1040 FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
1041 FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt;
/* Load the second C column. */
1047 c0_nxt = *(c + 0 + ldc);
1048 c1_nxt = *(c + 1 + ldc);
1049 c2_nxt = *(c + 2 + ldc);
1050 c3_nxt = *(c + 3 + ldc);
/* GEMM update: both columns per k step. */
1052 for (k = 0; k < bk; k++)
1054 c0 -= aa[0] * bb[0];
1055 c1 -= aa[1] * bb[0];
1056 c2 -= aa[2] * bb[0];
1057 c3 -= aa[3] * bb[0];
1058 c0_nxt -= aa[0] * bb[1];
1059 c1_nxt -= aa[1] * bb[1];
1060 c2_nxt -= aa[2] * bb[1];
1061 c3_nxt -= aa[3] * bb[1];
/* Backward substitution on the second column (eliminate 3, 2, 1). */
1085 c2_nxt -= c3_nxt * a14;
1091 c1_nxt -= c3_nxt * a13;
1094 c1_nxt -= c2_nxt * a9;
1100 c0_nxt -= c3_nxt * a12;
1103 c0_nxt -= c2_nxt * a8;
1106 c0_nxt -= c1_nxt * a4;
/* Store the solved second column. */
1124 *(c + 0 + ldc) = c0_nxt;
1125 *(c + 1 + ldc) = c1_nxt;
1126 *(c + 2 + ldc) = c2_nxt;
1127 *(c + 3 + ldc) = c3_nxt;
/* 4x1 "LN" solver, scalar version: one C column.
 * NOTE(review): substitution and store statements are elided from this
 * listing; only declarations and the GEMM-update loop are visible. */
1130 static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
1133 FLOAT *aa = a, *bb = b;
1134 FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
/* GEMM update: c -= a_col * b scalar per k step. */
1141 for (k = 0; k < bk; k++)
1143 c0 -= aa[0] * bb[0];
1144 c1 -= aa[1] * bb[0];
1145 c2 -= aa[2] * bb[0];
1146 c3 -= aa[3] * bb[0];
/* 2x8 "LN" solver, scalar version: eight C columns of two elements
 * each; GEMM update over bk, then a 2x2 backward substitution using
 * packed A elements a0, a2, a3.
 * NOTE(review): listing has elided lines (c0/c1 loads, a-element loads,
 * diagonal scalings with a0/a3, part of the b-panel stores). */
1191 static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1194 FLOAT *aa = a, *bb = b;
1195 FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3;
1196 FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
1197 FLOAT c0_nxt7, c1_nxt7;
/* Load columns 1..7 of the 2x8 C tile (column 0 load elided). */
1201 c0_nxt1 = *(c + 0 + 1 * ldc);
1202 c1_nxt1 = *(c + 1 + 1 * ldc);
1203 c0_nxt2 = *(c + 0 + 2 * ldc);
1204 c1_nxt2 = *(c + 1 + 2 * ldc);
1205 c0_nxt3 = *(c + 0 + 3 * ldc);
1206 c1_nxt3 = *(c + 1 + 3 * ldc);
1207 c0_nxt4 = *(c + 0 + 4 * ldc);
1208 c1_nxt4 = *(c + 1 + 4 * ldc);
1209 c0_nxt5 = *(c + 0 + 5 * ldc);
1210 c1_nxt5 = *(c + 1 + 5 * ldc);
1211 c0_nxt6 = *(c + 0 + 6 * ldc);
1212 c1_nxt6 = *(c + 1 + 6 * ldc);
1213 c0_nxt7 = *(c + 0 + 7 * ldc);
1214 c1_nxt7 = *(c + 1 + 7 * ldc);
/* GEMM update: all 8 columns per k step. */
1216 for (k = 0; k < bk; k++)
1218 c0 -= aa[0] * bb[0];
1219 c1 -= aa[1] * bb[0];
1220 c0_nxt1 -= aa[0] * bb[1];
1221 c1_nxt1 -= aa[1] * bb[1];
1222 c0_nxt2 -= aa[0] * bb[2];
1223 c1_nxt2 -= aa[1] * bb[2];
1224 c0_nxt3 -= aa[0] * bb[3];
1225 c1_nxt3 -= aa[1] * bb[3];
1226 c0_nxt4 -= aa[0] * bb[4];
1227 c1_nxt4 -= aa[1] * bb[4];
1228 c0_nxt5 -= aa[0] * bb[5];
1229 c1_nxt5 -= aa[1] * bb[5];
1230 c0_nxt6 -= aa[0] * bb[6];
1231 c1_nxt6 -= aa[1] * bb[6];
1232 c0_nxt7 -= aa[0] * bb[7];
1233 c1_nxt7 -= aa[1] * bb[7];
/* Substitution: eliminate component 1 from component 0 in each column. */
1256 c0_nxt1 -= c1_nxt1 * a2;
1257 c0_nxt2 -= c1_nxt2 * a2;
1258 c0_nxt3 -= c1_nxt3 * a2;
1259 c0_nxt4 -= c1_nxt4 * a2;
1260 c0_nxt5 -= c1_nxt5 * a2;
1261 c0_nxt6 -= c1_nxt6 * a2;
1262 c0_nxt7 -= c1_nxt7 * a2;
/* Store solved values to the packed b panel (b+0..9 stores elided). */
1283 *(b + 10) = c1_nxt2;
1284 *(b + 11) = c1_nxt3;
1285 *(b + 12) = c1_nxt4;
1286 *(b + 13) = c1_nxt5;
1287 *(b + 14) = c1_nxt6;
1288 *(b + 15) = c1_nxt7;
/* Store all 8 columns back to C. */
1292 *(c + 0 + 1 * ldc) = c0_nxt1;
1293 *(c + 1 + 1 * ldc) = c1_nxt1;
1294 *(c + 0 + 2 * ldc) = c0_nxt2;
1295 *(c + 1 + 2 * ldc) = c1_nxt2;
1296 *(c + 0 + 3 * ldc) = c0_nxt3;
1297 *(c + 1 + 3 * ldc) = c1_nxt3;
1298 *(c + 0 + 4 * ldc) = c0_nxt4;
1299 *(c + 1 + 4 * ldc) = c1_nxt4;
1300 *(c + 0 + 5 * ldc) = c0_nxt5;
1301 *(c + 1 + 5 * ldc) = c1_nxt5;
1302 *(c + 0 + 6 * ldc) = c0_nxt6;
1303 *(c + 1 + 6 * ldc) = c1_nxt6;
1304 *(c + 0 + 7 * ldc) = c0_nxt7;
1305 *(c + 1 + 7 * ldc) = c1_nxt7;
/* 2x4 "LN" solver, scalar version: four 2-element C columns; GEMM
 * update over bk, then a 2x2 backward substitution with a0, a2, a3.
 * NOTE(review): listing has elided lines (c0/c1 load, a-loads,
 * diagonal scalings, b-panel stores, column-0 stores). */
1308 static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1311 FLOAT *aa = a, *bb = b;
1312 FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
1313 FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
/* Load columns 1..3 of the C tile. */
1317 c0_nxt1 = *(c + 0 + ldc);
1318 c1_nxt1 = *(c + 1 + ldc);
1319 c0_nxt2 = *(c + 0 + 2 * ldc);
1320 c1_nxt2 = *(c + 1 + 2 * ldc);
1321 c0_nxt3 = *(c + 0 + 3 * ldc);
1322 c1_nxt3 = *(c + 1 + 3 * ldc);
/* GEMM update: all 4 columns per k step. */
1324 for (k = 0; k < bk; k++)
1326 c0 -= aa[0] * bb[0];
1327 c1 -= aa[1] * bb[0];
1328 c0_nxt1 -= aa[0] * bb[1];
1329 c1_nxt1 -= aa[1] * bb[1];
1330 c0_nxt2 -= aa[0] * bb[2];
1331 c1_nxt2 -= aa[1] * bb[2];
1332 c0_nxt3 -= aa[0] * bb[3];
1333 c1_nxt3 -= aa[1] * bb[3];
/* Substitution: eliminate component 1 from component 0. */
1352 c0_nxt1 -= c1_nxt1 * a2;
1353 c0_nxt2 -= c1_nxt2 * a2;
1354 c0_nxt3 -= c1_nxt3 * a2;
/* Store solved columns 1..3 back to C. */
1372 *(c + 0 + ldc) = c0_nxt1;
1373 *(c + 1 + ldc) = c1_nxt1;
1374 *(c + 0 + 2 * ldc) = c0_nxt2;
1375 *(c + 1 + 2 * ldc) = c1_nxt2;
1376 *(c + 0 + 3 * ldc) = c0_nxt3;
1377 *(c + 1 + 3 * ldc) = c1_nxt3;
/* 2x2 "LN" solver, scalar version: two 2-element C columns.
 * NOTE(review): listing has elided lines (c0/c1 load, a-loads,
 * diagonal scalings, b-panel stores, column-0 stores). */
1380 static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1383 FLOAT *aa = a, *bb = b;
1384 FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;
1388 c0_nxt = *(c + 0 + ldc);
1389 c1_nxt = *(c + 1 + ldc);
/* GEMM update over bk steps. */
1391 for (k = 0; k < bk; k++)
1393 c0 -= aa[0] * bb[0];
1394 c1 -= aa[1] * bb[0];
1395 c0_nxt -= aa[0] * bb[1];
1396 c1_nxt -= aa[1] * bb[1];
/* Substitution: eliminate component 1 from component 0. */
1413 c0_nxt -= c1_nxt * a2;
/* Store the solved second column. */
1425 *(c + 0 + ldc) = c0_nxt;
1426 *(c + 1 + ldc) = c1_nxt;
/* 2x1 "LN" solver, scalar version: one 2-element C column.
 * NOTE(review): substitution and store statements are elided; only
 * declarations and the GEMM-update loop are visible. */
1429 static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
1432 FLOAT *aa = a, *bb = b;
1433 FLOAT a0, a2, a3, c0, c1;
/* GEMM update over bk steps. */
1438 for (k = 0; k < bk; k++)
1440 c0 -= aa[0] * bb[0];
1441 c1 -= aa[1] * bb[0];
/* 1x8 "LN" solver, scalar version: a single A row element scales eight
 * C values (one per column).  GEMM update over bk, then (on elided
 * lines) scaling by the diagonal element and b-panel stores.
 * NOTE(review): c0 load, a0 load/scaling, and b stores are elided. */
1466 static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1469 FLOAT *aa = a, *bb = b;
1470 FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7;
/* Load one element from each of the 8 C columns (c0 load elided). */
1473 c1 = *(c + 1 * ldc);
1474 c2 = *(c + 2 * ldc);
1475 c3 = *(c + 3 * ldc);
1476 c4 = *(c + 4 * ldc);
1477 c5 = *(c + 5 * ldc);
1478 c6 = *(c + 6 * ldc);
1479 c7 = *(c + 7 * ldc);
/* GEMM update: all 8 columns per k step. */
1481 for (k = 0; k < bk; k++)
1483 c0 -= aa[0] * bb[0];
1484 c1 -= aa[0] * bb[1];
1485 c2 -= aa[0] * bb[2];
1486 c3 -= aa[0] * bb[3];
1487 c4 -= aa[0] * bb[4];
1488 c5 -= aa[0] * bb[5];
1489 c6 -= aa[0] * bb[6];
1490 c7 -= aa[0] * bb[7];
/* Store the solved values back to C. */
1516 *(c + 0 * ldc) = c0;
1517 *(c + 1 * ldc) = c1;
1518 *(c + 2 * ldc) = c2;
1519 *(c + 3 * ldc) = c3;
1520 *(c + 4 * ldc) = c4;
1521 *(c + 5 * ldc) = c5;
1522 *(c + 6 * ldc) = c6;
1523 *(c + 7 * ldc) = c7;
/* 1x4 "LN" solver, scalar version: four single-element C columns.
 * NOTE(review): the diagonal scaling and b-panel stores are elided. */
1526 static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1529 FLOAT *aa = a, *bb = b;
1530 FLOAT a0, c0, c1, c2, c3;
1532 c0 = *(c + 0 * ldc);
1533 c1 = *(c + 1 * ldc);
1534 c2 = *(c + 2 * ldc);
1535 c3 = *(c + 3 * ldc);
/* GEMM update over bk steps. */
1537 for (k = 0; k < bk; k++)
1539 c0 -= aa[0] * bb[0];
1540 c1 -= aa[0] * bb[1];
1541 c2 -= aa[0] * bb[2];
1542 c3 -= aa[0] * bb[3];
/* Store the solved values back to C. */
1560 *(c + 0 * ldc) = c0;
1561 *(c + 1 * ldc) = c1;
1562 *(c + 2 * ldc) = c2;
1563 *(c + 3 * ldc) = c3;
/* 1x2 "LN" solver, scalar version: two single-element C columns.
 * NOTE(review): declarations of c0/c1, their loads, the diagonal
 * scaling, and the b-panel stores are elided from this listing. */
1566 static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1569 FLOAT *aa = a, *bb = b;
/* GEMM update over bk steps. */
1575 for (k = 0; k < bk; k++)
1577 c0 -= aa[0] * bb[0];
1578 c1 -= aa[0] * bb[1];
/* Store the solved values back to C. */
1592 *(c + 0 * ldc) = c0;
1593 *(c + 1 * ldc) = c1;
/* 1x1 "LN" solver: single C element.
 * NOTE(review): the entire body other than the loop header is elided. */
1596 static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/* GEMM update over bk steps (body elided). */
1600 for (k = 0; k < bk; k++)
1609 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
1610 FLOAT *c, BLASLONG ldc, BLASLONG offset)
1615 for (j = (n >> 3); j--;)
1622 aa = a + (m - 1) * k + kk;
1625 ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
1632 aa = a + ((m & ~1) - 2) * k + 2 * kk;
1633 cc = c + ((m & ~1) - 2);
1635 ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
1642 aa = a + ((m & ~3) - 4) * k + 4 * kk;
1643 cc = c + ((m & ~3) - 4);
1645 ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
1654 aa = a + ((m & ~7) - 8) * k;
1655 cc = c + ((m & ~7) - 8);
1659 ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
1682 aa = a + (m - 1) * k + kk;
1685 ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
1692 aa = a + ((m & ~1) - 2) * k + 2 * kk;
1693 cc = c + ((m & ~1) - 2);
1695 ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
1702 aa = a + ((m & ~3) - 4) * k + 4 * kk;
1703 cc = c + ((m & ~3) - 4);
1705 ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
1714 aa = a + ((m & ~7) - 8) * k;
1715 cc = c + ((m & ~7) - 8);
1719 ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
1740 aa = a + (m - 1) * k + kk;
1743 ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
1750 aa = a + ((m & ~1) - 2) * k + 2 * kk;
1751 cc = c + ((m & ~1) - 2);
1753 ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
1760 aa = a + ((m & ~3) - 4) * k + 4 * kk;
1761 cc = c + ((m & ~3) - 4);
1763 ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
1772 aa = a + ((m & ~7) - 8) * k;
1773 cc = c + ((m & ~7) - 8);
1777 ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
1798 aa = a + (m - 1) * k + kk;
1801 ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk));
1808 aa = a + ((m & ~1) - 2) * k + 2 * kk;
1809 cc = c + ((m & ~1) - 2);
1811 ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
1818 aa = a + ((m & ~3) - 4) * k + 4 * kk;
1819 cc = c + ((m & ~3) - 4);
1821 ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk));
1830 aa = a + ((m & ~7) - 8) * k;
1831 cc = c + ((m & ~7) - 8);
1835 ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));