1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 static __attribute__ ((noinline))
32 void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
34 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
35 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
36 v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
37 v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
38 v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
39 v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
40 v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
41 v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
42 v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
43 v2f64 src_a61, src_a62, src_a63;
44 FLOAT *c_nxt1line = c + ldc;
45 FLOAT *c_nxt2line = c + 2 * ldc;
46 FLOAT *c_nxt3line = c + 3 * ldc;
61 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
62 LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
63 LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
64 LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
68 BLASLONG i, pref_offset;
69 FLOAT *pba = a, *pbb = b, *pa0_pref;
70 v2f64 src_b, src_b0, src_b1;
72 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
76 pref_offset = L1_DATA_LINESIZE - pref_offset;
77 pref_offset = pref_offset / sizeof(FLOAT);
80 pa0_pref = a + pref_offset;
82 for (i = bk >> 1; i--;)
84 PREF_OFFSET(pa0_pref, 128);
85 PREF_OFFSET(pa0_pref, 160);
86 PREF_OFFSET(pa0_pref, 192);
87 PREF_OFFSET(pa0_pref, 224);
89 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
90 LD_DP2_INC(pbb, 2, src_b0, src_b1);
92 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
93 src_c0 -= src_a0 * src_b;
94 src_c1 -= src_a1 * src_b;
95 src_c2 -= src_a2 * src_b;
96 src_c3 -= src_a3 * src_b;
98 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
99 src_c4 -= src_a0 * src_b;
100 src_c5 -= src_a1 * src_b;
101 src_c6 -= src_a2 * src_b;
102 src_c7 -= src_a3 * src_b;
104 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
105 src_c8 -= src_a0 * src_b;
106 src_c9 -= src_a1 * src_b;
107 src_c10 -= src_a2 * src_b;
108 src_c11 -= src_a3 * src_b;
110 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
111 src_c12 -= src_a0 * src_b;
112 src_c13 -= src_a1 * src_b;
113 src_c14 -= src_a2 * src_b;
114 src_c15 -= src_a3 * src_b;
116 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
117 LD_DP2_INC(pbb, 2, src_b0, src_b1);
119 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
120 src_c0 -= src_a0 * src_b;
121 src_c1 -= src_a1 * src_b;
122 src_c2 -= src_a2 * src_b;
123 src_c3 -= src_a3 * src_b;
125 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
126 src_c4 -= src_a0 * src_b;
127 src_c5 -= src_a1 * src_b;
128 src_c6 -= src_a2 * src_b;
129 src_c7 -= src_a3 * src_b;
131 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
132 src_c8 -= src_a0 * src_b;
133 src_c9 -= src_a1 * src_b;
134 src_c10 -= src_a2 * src_b;
135 src_c11 -= src_a3 * src_b;
137 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
138 src_c12 -= src_a0 * src_b;
139 src_c13 -= src_a1 * src_b;
140 src_c14 -= src_a2 * src_b;
141 src_c15 -= src_a3 * src_b;
148 LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
149 LD_DP2_INC(pbb, 2, src_b0, src_b1);
151 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
152 src_c0 -= src_a0 * src_b;
153 src_c1 -= src_a1 * src_b;
154 src_c2 -= src_a2 * src_b;
155 src_c3 -= src_a3 * src_b;
157 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
158 src_c4 -= src_a0 * src_b;
159 src_c5 -= src_a1 * src_b;
160 src_c6 -= src_a2 * src_b;
161 src_c7 -= src_a3 * src_b;
163 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
164 src_c8 -= src_a0 * src_b;
165 src_c9 -= src_a1 * src_b;
166 src_c10 -= src_a2 * src_b;
167 src_c11 -= src_a3 * src_b;
169 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
170 src_c12 -= src_a0 * src_b;
171 src_c13 -= src_a1 * src_b;
172 src_c14 -= src_a2 * src_b;
173 src_c15 -= src_a3 * src_b;
180 ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
181 ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
182 ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
183 ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
184 ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
185 ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
186 ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
187 ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
189 src_a54 = __msa_cast_to_vector_double(*(a + 54));
190 src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
191 src_a62 = LD_DP(a + 62);
192 src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
193 src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
194 src_a60 = LD_DP(a + 60);
195 src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
196 src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
197 src_a52 = LD_DP(a + 52);
198 src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
199 src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
200 src_a44 = LD_DP(a + 44);
201 src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
202 src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
203 src_a36 = __msa_cast_to_vector_double(*(a + 36));
204 src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
207 res_c6 -= res_c7 * src_a62;
211 res_c14 -= res_c15 * src_a62;
214 ST_DP(res_c7, b + 28);
215 ST_DP(res_c6, b + 24);
216 ST_DP(res_c15, b + 30);
217 ST_DP(res_c14, b + 26);
218 ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
219 ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
220 ST_DP(src_c3, c + 6);
221 ST_DP(src_c7, c_nxt1line + 6);
222 ST_DP(src_c11, c_nxt2line + 6);
223 ST_DP(src_c15, c_nxt3line + 6);
225 res_c5 -= res_c7 * src_a61;
226 res_c5 -= res_c6 * src_a53;
229 res_c4 -= res_c7 * src_a60;
230 res_c4 -= res_c6 * src_a52;
231 res_c4 -= res_c5 * src_a44;
234 res_c13 -= res_c15 * src_a61;
235 res_c13 -= res_c14 * src_a53;
238 res_c12 -= res_c15 * src_a60;
239 res_c12 -= res_c14 * src_a52;
240 res_c12 -= res_c13 * src_a44;
243 src_a56 = LD_DP(a + 56);
244 src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
245 src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
246 src_a58 = LD_DP(a + 58);
247 src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
248 src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
250 ST_DP(res_c4, b + 16);
251 ST_DP(res_c5, b + 20);
252 ST_DP(res_c12, b + 18);
253 ST_DP(res_c13, b + 22);
255 ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
256 ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
257 ST_DP(src_c2, c + 4);
258 ST_DP(src_c6, c_nxt1line + 4);
259 ST_DP(src_c10, c_nxt2line + 4);
260 ST_DP(src_c14, c_nxt3line + 4);
262 src_a50 = LD_DP(a + 50);
263 src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
264 src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
265 src_a42 = LD_DP(a + 42);
266 src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
267 src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
268 src_a34 = LD_DP(a + 34);
269 src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
270 src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
271 src_a26 = LD_DP(a + 26);
272 src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
273 src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
274 src_a18 = __msa_cast_to_vector_double(*(a + 18));
275 src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
277 res_c3 -= res_c7 * src_a59;
278 res_c2 -= res_c7 * src_a58;
279 res_c1 -= res_c7 * src_a57;
280 res_c0 -= res_c7 * src_a56;
282 res_c11 -= res_c15 * src_a59;
283 res_c10 -= res_c15 * src_a58;
284 res_c9 -= res_c15 * src_a57;
285 res_c8 -= res_c15 * src_a56;
287 res_c3 -= res_c6 * src_a51;
288 res_c3 -= res_c5 * src_a43;
289 res_c3 -= res_c4 * src_a35;
292 res_c2 -= res_c6 * src_a50;
293 res_c2 -= res_c5 * src_a42;
294 res_c2 -= res_c4 * src_a34;
295 res_c2 -= res_c3 * src_a26;
298 res_c11 -= res_c14 * src_a51;
299 res_c11 -= res_c13 * src_a43;
300 res_c11 -= res_c12 * src_a35;
303 res_c10 -= res_c14 * src_a50;
304 res_c10 -= res_c13 * src_a42;
305 res_c10 -= res_c12 * src_a34;
306 res_c10 -= res_c11 * src_a26;
309 src_a48 = LD_DP(a + 48);
310 src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
311 src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
312 src_a40 = LD_DP(a + 40);
313 src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
314 src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
316 ST_DP(res_c2, b + 8);
317 ST_DP(res_c3, b + 12);
318 ST_DP(res_c10, b + 10);
319 ST_DP(res_c11, b + 14);
321 src_a32 = LD_DP(a + 32);
322 src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
323 src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
324 src_a24 = LD_DP(a + 24);
325 src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
326 src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
328 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
329 ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
330 ST_DP(src_c1, c + 2);
331 ST_DP(src_c5, c_nxt1line + 2);
332 ST_DP(src_c9, c_nxt2line + 2);
333 ST_DP(src_c13, c_nxt3line + 2);
335 res_c1 -= res_c6 * src_a49;
336 res_c1 -= res_c5 * src_a41;
337 res_c1 -= res_c4 * src_a33;
338 res_c1 -= res_c3 * src_a25;
340 res_c0 -= res_c6 * src_a48;
341 res_c0 -= res_c5 * src_a40;
342 res_c0 -= res_c4 * src_a32;
343 res_c0 -= res_c3 * src_a24;
345 res_c9 -= res_c14 * src_a49;
346 res_c9 -= res_c13 * src_a41;
347 res_c9 -= res_c12 * src_a33;
348 res_c9 -= res_c11 * src_a25;
350 res_c8 -= res_c14 * src_a48;
351 res_c8 -= res_c13 * src_a40;
352 res_c8 -= res_c12 * src_a32;
353 res_c8 -= res_c11 * src_a24;
355 src_a16 = LD_DP(a + 16);
356 src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
357 src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
358 src_a8 = LD_DP(a + 8);
359 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
360 src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
361 src_a0 = __msa_cast_to_vector_double(*(a + 0));
362 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
364 res_c1 -= res_c2 * src_a17;
367 res_c9 -= res_c10 * src_a17;
370 res_c0 -= res_c2 * src_a16;
371 res_c0 -= res_c1 * src_a8;
374 res_c8 -= res_c10 * src_a16;
375 res_c8 -= res_c9 * src_a8;
378 ST_DP(res_c0, b + 0);
379 ST_DP(res_c8, b + 2);
380 ST_DP(res_c1, b + 4);
381 ST_DP(res_c9, b + 6);
383 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
384 ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
387 ST_DP(src_c4, c_nxt1line);
388 ST_DP(src_c8, c_nxt2line);
389 ST_DP(src_c12, c_nxt3line);
/* dsolve_8x2_ln_msa: same as dsolve_8x4_ln_msa but for an 8x2 C tile
 * (two RHS columns).  GEMM update first, then back-substitution from
 * row 7 down to row 0 of the 8x8 triangular block.
 * Here the triangular block is addressed with NEGATIVE offsets
 * (a-64 .. a-1), i.e. 'a' apparently points one element past the
 * block -- confirm against the elided pointer-advancing code. */
392 static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
394 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
395 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
396 v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
397 v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
398 v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
399 v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
400 v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
401 v2f64 src_a61, src_a62, src_a63;
/* load the 8x2 C tile (two ldc-columns of 8 doubles) */
403 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
404 LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
409 FLOAT *pba = a, *pbb = b;
410 v2f64 src_b, src_b0, src_b1;
/* software-pipelined GEMM update: A for step i+1 is loaded while
   step i is computed; hence the (bk - 1)-iteration loop plus tail */
412 LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
415 for (i = bk - 1; i--;)
420 LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
/* broadcast b[0]/b[1] and update the two C columns */
423 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
424 src_c0 -= src_a0 * src_b;
425 src_c1 -= src_a1 * src_b;
426 src_c2 -= src_a2 * src_b;
427 src_c3 -= src_a3 * src_b;
429 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
430 src_c4 -= src_a0 * src_b;
431 src_c5 -= src_a1 * src_b;
432 src_c6 -= src_a2 * src_b;
433 src_c7 -= src_a3 * src_b;
/* epilogue: final pipelined k-step */
442 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
443 src_c0 -= src_a0 * src_b;
444 src_c1 -= src_a1 * src_b;
445 src_c2 -= src_a2 * src_b;
446 src_c3 -= src_a3 * src_b;
448 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
449 src_c4 -= src_a0 * src_b;
450 src_c5 -= src_a1 * src_b;
451 src_c6 -= src_a2 * src_b;
452 src_c7 -= src_a3 * src_b;
/* transpose so each res_cN holds one solve row across both columns */
455 ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
456 ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
457 ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
458 ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
/* splat row-7 coefficients (block row a[-8..-1]) */
460 src_a56 = LD_DP(a - 8);
461 src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
462 src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
463 src_a58 = LD_DP(a - 6);
464 src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
465 src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
466 src_a60 = LD_DP(a - 4);
467 src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
468 src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
469 src_a62 = LD_DP(a - 2);
470 src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
471 src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
/* eliminate row 7 from rows 6..0 */
474 res_c6 -= res_c7 * src_a62;
475 res_c5 -= res_c7 * src_a61;
476 res_c4 -= res_c7 * src_a60;
477 res_c3 -= res_c7 * src_a59;
478 res_c2 -= res_c7 * src_a58;
479 res_c1 -= res_c7 * src_a57;
480 res_c0 -= res_c7 * src_a56;
/* row-6 and row-5 coefficients */
482 src_a48 = LD_DP(a - 16);
483 src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
484 src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
485 src_a50 = LD_DP(a - 14);
486 src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
487 src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
488 src_a52 = LD_DP(a - 12);
489 src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
490 src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
491 src_a54 = __msa_cast_to_vector_double(*(a - 10));
492 src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
494 src_a40 = LD_DP(a - 24);
495 src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
496 src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
497 src_a42 = LD_DP(a - 22);
498 src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
499 src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
500 src_a44 = LD_DP(a - 20);
501 src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
502 src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
/* eliminate row 6, then row 5, from the remaining rows */
505 res_c5 -= res_c6 * src_a53;
506 res_c4 -= res_c6 * src_a52;
507 res_c3 -= res_c6 * src_a51;
508 res_c2 -= res_c6 * src_a50;
509 res_c1 -= res_c6 * src_a49;
510 res_c0 -= res_c6 * src_a48;
513 res_c4 -= res_c5 * src_a44;
514 res_c3 -= res_c5 * src_a43;
515 res_c2 -= res_c5 * src_a42;
516 res_c1 -= res_c5 * src_a41;
517 res_c0 -= res_c5 * src_a40;
/* packed B is also written with negative offsets (b past tile end) */
519 ST_DP(res_c7, b - 2);
520 ST_DP(res_c6, b - 4);
521 ST_DP(res_c5, b - 6);
523 src_a32 = LD_DP(a - 32);
524 src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
525 src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
526 src_a34 = LD_DP(a - 30);
527 src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
528 src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
529 src_a36 = __msa_cast_to_vector_double(*(a - 28));
530 src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
/* eliminate row 4 */
533 res_c3 -= res_c4 * src_a35;
534 res_c2 -= res_c4 * src_a34;
535 res_c1 -= res_c4 * src_a33;
536 res_c0 -= res_c4 * src_a32;
538 src_a24 = LD_DP(a - 40);
539 src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
540 src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
541 src_a26 = LD_DP(a - 38);
542 src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
543 src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
544 src_a16 = LD_DP(a - 48);
545 src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
546 src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
547 src_a18 = __msa_cast_to_vector_double(*(a - 46));
548 src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
549 src_a0 = __msa_cast_to_vector_double(*(a - 64));
550 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
551 src_a8 = LD_DP(a - 56);
552 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
553 src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
/* eliminate rows 3, 2, 1 to finish rows 2..0 */
556 res_c2 -= res_c3 * src_a26;
557 res_c1 -= res_c3 * src_a25;
558 res_c0 -= res_c3 * src_a24;
561 res_c1 -= res_c2 * src_a17;
562 res_c0 -= res_c2 * src_a16;
565 res_c0 -= res_c1 * src_a8;
/* store all solved rows to packed B, then transposed back to C */
569 ST_DP(res_c4, b - 8);
570 ST_DP(res_c3, b - 10);
571 ST_DP(res_c2, b - 12);
572 ST_DP(res_c1, b - 14);
573 ST_DP(res_c0, b - 16);
575 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
576 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
577 ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
578 ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
580 ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
581 ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
/* dsolve_8x1_ln_msa: scalar 8x1 variant -- one RHS column, so plain
 * FLOAT arithmetic instead of MSA vectors.  Only the declarations are
 * visible in this excerpt; the GEMM update and back-substitution body
 * (original lines ~604-723) is elided. */
586 FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
587 FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
588 FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
589 FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
/* working cursors over the packed A and B panels */
603 FLOAT *aa = a, *bb = b;
/* dsolve_4x4_ln_msa: 4(m) x 4(n) tile.  GEMM update (C -= A*B over bk
 * steps), then back-substitution against the 4x4 triangular block at
 * 'a' (row-major, a[row*4+col]), solving rows 3 -> 0.  Results go to
 * packed B and back to C. */
724 static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
726 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
727 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
728 v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
729 v2f64 src_a14, src_a15;
/* load the 4x4 C tile: two v2f64 per ldc-column */
731 LD_DP2(c, 2, src_c0, src_c1);
732 LD_DP2(c + ldc, 2, src_c2, src_c3);
733 LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
734 LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
739 FLOAT *aa = a, *bb = b;
740 v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
/* GEMM update body: one rank-1 step (loop header elided) */
744 LD_DP2(aa, 2, src_a0, src_a1);
745 LD_DP2(bb, 2, src_b0, src_b1);
747 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
748 src_c0 -= src_a0 * src_b;
749 src_c1 -= src_a1 * src_b;
751 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
752 src_c2 -= src_a0 * src_b;
753 src_c3 -= src_a1 * src_b;
755 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
756 src_c4 -= src_a0 * src_b;
757 src_c5 -= src_a1 * src_b;
759 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
760 src_c6 -= src_a0 * src_b;
761 src_c7 -= src_a1 * src_b;
/* transpose: res_cN = row N of the tile across two RHS columns */
771 ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
772 ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
773 ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
774 ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
/* splat the triangular coefficients a[14],a[15] (row 3), a[12],a[13],
   a[8],a[9],a[10] (rows 2..3), a[4],a[5], a[0] */
776 src_a14 = LD_DP(a + 14);
777 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
778 src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
780 src_a12 = LD_DP(a + 12);
781 src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
782 src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
/* unaligned vector load at odd index 9 yields a[9] and a[10] */
784 src_a9 = LD_DP(a + 9);
785 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
786 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
788 src_a8 = __msa_cast_to_vector_double(*(a + 8));
789 src_a0 = __msa_cast_to_vector_double(*(a + 0));
791 src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
792 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
794 src_a4 = LD_DP(a + 4);
795 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
796 src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
/* back-substitution: eliminate row 3, then 2, then 1
   (diagonal-scaling lines are elided in this excerpt) */
801 res_c2 -= res_c3 * src_a14;
802 res_c6 -= res_c7 * src_a14;
806 res_c1 -= res_c3 * src_a13;
807 res_c5 -= res_c7 * src_a13;
808 res_c1 -= res_c2 * src_a9;
809 res_c5 -= res_c6 * src_a9;
813 res_c0 -= res_c3 * src_a12;
814 res_c4 -= res_c7 * src_a12;
815 res_c0 -= res_c2 * src_a8;
816 res_c4 -= res_c6 * src_a8;
817 res_c0 -= res_c1 * src_a4;
818 res_c4 -= res_c5 * src_a4;
/* store solved rows to packed B ... */
822 ST_DP(res_c7, b + 14);
823 ST_DP(res_c3, b + 12);
824 ST_DP(res_c6, b + 10);
825 ST_DP(res_c2, b + 8);
826 ST_DP(res_c5, b + 6);
827 ST_DP(res_c1, b + 4);
828 ST_DP(res_c4, b + 2);
829 ST_DP(res_c0, b + 0);
/* ... and transposed back to the C tile */
831 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
832 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
833 ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
834 ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
836 ST_DP2(src_c0, src_c1, c, 2);
837 ST_DP2(src_c2, src_c3, c + ldc, 2);
838 ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
839 ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
/* dsolve_4x2_ln_msa: 4(m) x 2(n) tile -- same structure as the 4x4
 * variant but with a single RHS vector pair. */
842 static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
844 v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
845 v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
846 v2f64 src_a14, src_a15;
/* load the 4x2 C tile */
848 LD_DP2(c, 2, src_c0, src_c1);
849 LD_DP2(c + ldc, 2, src_c2, src_c3);
854 FLOAT *aa = a, *bb = b;
855 v2f64 src_a0, src_a1, src_b, src_b0;
/* GEMM update body (loop header and b-load elided) */
859 LD_DP2(aa, 2, src_a0, src_a1);
862 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
863 src_c0 -= src_a0 * src_b;
864 src_c1 -= src_a1 * src_b;
866 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
867 src_c2 -= src_a0 * src_b;
868 src_c3 -= src_a1 * src_b;
/* transpose so res_cN holds row N across both RHS columns */
878 ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
879 ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
/* splat triangular coefficients of the 4x4 block (row-major) */
881 src_a14 = LD_DP(a + 14);
882 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
883 src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
885 src_a12 = LD_DP(a + 12);
886 src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
887 src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
889 src_a9 = LD_DP(a + 9);
890 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
891 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
893 src_a8 = __msa_cast_to_vector_double(*(a + 8));
894 src_a0 = __msa_cast_to_vector_double(*(a + 0));
896 src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
897 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
899 src_a4 = LD_DP(a + 4);
900 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
901 src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
/* back-substitution rows 3 -> 0 (diagonal scaling elided) */
905 res_c2 -= res_c3 * src_a14;
908 res_c1 -= res_c3 * src_a13;
909 res_c1 -= res_c2 * src_a9;
912 res_c0 -= res_c3 * src_a12;
913 res_c0 -= res_c2 * src_a8;
914 res_c0 -= res_c1 * src_a4;
/* store to packed B and back to C */
917 ST_DP(res_c3, b + 6);
918 ST_DP(res_c2, b + 4);
919 ST_DP(res_c1, b + 2);
920 ST_DP(res_c0, b + 0);
922 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
923 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
925 ST_DP2(src_c0, src_c1, c, 2);
926 ST_DP2(src_c2, src_c3, c + ldc, 2);
/* dsolve_4x1_ln_msa: scalar 4x1 variant (single RHS column).  Only the
 * declarations are visible here; the update/solve body is elided. */
929 static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
931 FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
/* working cursors over the packed A and B panels */
941 FLOAT *aa = a, *bb = b;
/* dsolve_2x4_ln_msa: scalar 2(m) x 4(n) tile.  c0/c1 are the two rows
 * of column 0; cX_nxtN are the rows of columns 1..3.  After the GEMM
 * update, the 2x2 triangular block (a0 diagonal, a2 off-diagonal, a3
 * diagonal) is solved; diagonal-scaling lines are elided here. */
994 static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
996 FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
997 FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
/* load the 2x4 C tile, one scalar per element */
1001 c0_nxt1 = *(c + 0 + ldc);
1002 c1_nxt1 = *(c + 1 + ldc);
1003 c0_nxt2 = *(c + 0 + 2 * ldc);
1004 c1_nxt2 = *(c + 1 + 2 * ldc);
1005 c0_nxt3 = *(c + 0 + 3 * ldc);
1006 c1_nxt3 = *(c + 1 + 3 * ldc);
1011 FLOAT *aa = a, *bb = b;
/* GEMM update body: one rank-1 step over all four columns
   (loop header and pointer increments elided) */
1015 c0 -= aa[0] * bb[0];
1016 c1 -= aa[1] * bb[0];
1017 c0_nxt1 -= aa[0] * bb[1];
1018 c1_nxt1 -= aa[1] * bb[1];
1019 c0_nxt2 -= aa[0] * bb[2];
1020 c1_nxt2 -= aa[1] * bb[2];
1021 c0_nxt3 -= aa[0] * bb[3];
1022 c1_nxt3 -= aa[1] * bb[3];
/* eliminate the solved second row from the first in each column */
1041 c0_nxt1 -= c1_nxt1 * a2;
1045 c0_nxt2 -= c1_nxt2 * a2;
1049 c0_nxt3 -= c1_nxt3 * a2;
/* write the solved tile back to C (writes to b are elided) */
1063 *(c + 0 + ldc) = c0_nxt1;
1064 *(c + 1 + ldc) = c1_nxt1;
1065 *(c + 0 + 2 * ldc) = c0_nxt2;
1066 *(c + 1 + 2 * ldc) = c1_nxt2;
1067 *(c + 0 + 3 * ldc) = c0_nxt3;
1068 *(c + 1 + 3 * ldc) = c1_nxt3;
/* dsolve_2x2_ln_msa: scalar 2x2 tile -- two rows (c0,c1) by two
 * columns (plain / _nxt).  Same 2x2 triangular solve as the 2x4
 * variant, over two RHS columns. */
1071 static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1073 FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;
/* load column 1 of the tile (column-0 loads elided) */
1078 c0_nxt = *(c + 0 + ldc);
1079 c1_nxt = *(c + 1 + ldc);
1084 FLOAT *aa = a, *bb = b;
/* GEMM update body (loop header elided) */
1088 c0 -= aa[0] * bb[0];
1089 c1 -= aa[1] * bb[0];
1091 c0_nxt -= aa[0] * bb[1];
1092 c1_nxt -= aa[1] * bb[1];
/* eliminate the solved second row from the first */
1113 c0_nxt -= c1_nxt * a2;
/* store column 1 back to C */
1124 *(c + 0 + ldc) = c0_nxt;
1125 *(c + 1 + ldc) = c1_nxt;
/* dsolve_2x1_ln_msa: scalar 2x1 tile (single RHS column).  The solve
 * and store lines are elided; only the GEMM-update body is visible. */
1128 static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
1130 FLOAT a0, a2, a3, c0, c1;
1138 FLOAT *aa = a, *bb = b;
/* GEMM update body (loop header and increments elided) */
1142 c0 -= aa[0] * bb[0];
1143 c1 -= aa[1] * bb[0];
/* dsolve_1x4_ln_msa: scalar 1(m) x 4(n) tile -- one C element per
 * ldc-column.  GEMM update, then (elided) scaling by the diagonal
 * element of A, then write-back. */
1165 static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1167 FLOAT c0, c1, c2, c3;
/* load one element from each of the four C columns (c0 load elided) */
1170 c1 = *(c + 1 * ldc);
1171 c2 = *(c + 2 * ldc);
1172 c3 = *(c + 3 * ldc);
1177 FLOAT *aa = a, *bb = b;
/* GEMM update body (loop header elided) */
1181 c0 -= aa[0] * bb[0];
1182 c1 -= aa[0] * bb[1];
1183 c2 -= aa[0] * bb[2];
1184 c3 -= aa[0] * bb[3];
/* write solved values back to C (packed-B stores elided) */
1196 *(c + 0 * ldc) = c0;
1197 *(c + 1 * ldc) = c1;
1198 *(c + 2 * ldc) = c2;
1199 *(c + 3 * ldc) = c3;
/* dsolve_1x2_ln_msa: 1x1 triangular solve applied to two RHS columns,
 * entirely in place on C, then copied to packed B.  Multiplying by *a
 * suggests the diagonal is stored pre-inverted -- confirm against the
 * packing routine.  (No bk parameter: caller passes k-kk == 0 here.) */
1207 static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
1210 *(c + ldc) = *a * *(c + ldc);
1213 *(b + 1) = *(c + ldc);
/* CNAME: dtrsm LN driver.  Tiles n in blocks of 4 (then 2, then 1) and,
 * within each column block, processes m from the BOTTOM edge upward:
 * first the m%2, m%4, m%8 remainder rows, then the main 8-row tiles --
 * note the (m & -POW2) - POW2 addressing that anchors each tile at the
 * high end of the matrix, as required for a left/lower solve.
 * 'kk' tracks the triangular offset derived from 'offset'; its
 * initialization and update lines are elided in this excerpt. */
1216 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
1217           FLOAT *c, BLASLONG ldc, BLASLONG offset)
1220 FLOAT *aa, *bb, *cc;
/* ---- column blocks of 4 ---- */
1222 for (j = (n >> 2); j--;)
/* m odd: single bottom row */
1230 aa = a + (m - 1) * k + kk;
1234 dsolve_1x4_ln_msa(aa, bb, cc, ldc, (k - kk));
/* m%4 >= 2: 2-row remainder tile */
1241 aa = a + ((m & -2) - 2) * k + 2 * kk;
1243 cc = c + ((m & -2) - 2);
1245 dsolve_2x4_ln_msa(aa, bb, cc, ldc, (k - kk));
/* m%8 >= 4: 4-row remainder tile */
1252 aa = a + ((m & -4) - 4) * k + 4 * kk;
1254 cc = c + ((m & -4) - 4);
1256 dsolve_4x4_ln_msa(aa, bb, cc, ldc, (k - kk));
/* main 8-row tiles, walking up from the bottom (loop header elided) */
1265 aa = a + ((m & -8) - 8) * k;
1266 cc = c + ((m & -8) - 8);
1270 dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
/* ---- column blocks of 2 (n & 2) ---- */
1293 aa = a + ((m & -1) - 1) * k;
1294 cc = c + ((m & -1) - 1);
/* 1x2 case is a pure in-place solve; pointers pre-offset by kk here */
1296 dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc);
1303 aa = a + ((m & -2) - 2) * k;
1304 cc = c + ((m & -2) - 2);
1306 dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, (k - kk));
1313 aa = a + ((m & -4) - 4) * k;
1314 cc = c + ((m & -4) - 4);
1316 dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, (k - kk));
1325 aa = a + ((m & -8) - 8) * k;
1326 cc = c + ((m & -8) - 8);
1330 dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, (k - kk));
/* ---- single remaining column (n & 1) ---- */
1352 aa = a + ((m & -1) - 1) * k + kk;
1353 cc = c + ((m & -1) - 1);
1361 aa = a + ((m & -2) - 2) * k + kk * 2;
1362 cc = c + ((m & -2) - 2);
1364 dsolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
1371 aa = a + ((m & -4) - 4) * k;
1372 cc = c + ((m & -4) - 4);
1374 dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, (k - kk));
1383 aa = a + ((m & -8) - 8) * k;
1384 cc = c + ((m & -8) - 8);
1388 dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));