1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * dsolve_8x4_lt_msa: handles one 8x4 tile of the DTRSM LT kernel.
 *
 * Phase 1 (GEMM update): over bk iterations, rank-1 updates subtract
 * A(8x1)*B(1x4) panels from the 16 v2f64 accumulators src_c0..src_c15
 * (4 registers of 2 doubles per column of C).  The loop is unrolled by
 * two with software prefetch of the A panel (PREF_OFFSET).
 * Phase 2 (solve): the updated C tile is transposed via ILVRL_D2_DP into
 * res_c0..res_c15 and forward-substituted against the packed 8x8
 * triangular block of A (elements a[0], a[9], a[18], ..., a[63] are the
 * diagonal entries; a[k*8+j], j>k, the off-diagonal multipliers).
 * Results are stored back both to the packed B buffer and, transposed
 * again, to the four C columns (c, c_nxt1line, c_nxt2line, c_nxt3line).
 *
 * NOTE(review): this view of the file is elided — opening/closing braces,
 * the diagonal multiply steps (e.g. res_c0 *= src_a0) and some loads/
 * stores between the visible statements are not shown.  Comments below
 * describe only what the visible lines establish; TODO confirm against
 * the full file.
 */
31 static __attribute__ ((noinline))
32 void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
34 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
35 v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
36 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
37 v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
38 v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
39 v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
40 v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
41 v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
42 v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
/* Pointers to the four columns of the C tile (column stride ldc). */
43 FLOAT *c_nxt1line = c + ldc;
44 FLOAT *c_nxt2line = c + 2 * ldc;
45 FLOAT *c_nxt3line = c + 3 * ldc;
/* Load the 8x4 C tile: 4 vector registers (8 doubles) per column. */
62 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
63 LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
64 LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
65 LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
69 BLASLONG i, pref_offset;
71 v2f64 src_b, src_b0, src_b1;
/* Align the prefetch pointer to the next L1 cache line boundary past a. */
73 pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
77 pref_offset = L1_DATA_LINESIZE - pref_offset;
78 pref_offset = pref_offset / sizeof(FLOAT);
81 pa0_pref = a + pref_offset;
/* GEMM update loop, unrolled 2x: C(8x4) -= A(8x1) * B(1x4) per step. */
83 for (i = (bk >> 1); i--;)
85 PREF_OFFSET(pa0_pref, 128);
86 PREF_OFFSET(pa0_pref, 160);
87 PREF_OFFSET(pa0_pref, 192);
88 PREF_OFFSET(pa0_pref, 224);
90 LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
91 LD_DP2_INC(b, 2, src_b0, src_b1);
/* Broadcast b[0] (ilvr = duplicate low element) and update column 0. */
93 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
94 src_c0 -= src_a0 * src_b;
95 src_c1 -= src_a1 * src_b;
96 src_c2 -= src_a2 * src_b;
97 src_c3 -= src_a3 * src_b;
/* Broadcast b[1] (ilvl = duplicate high element) and update column 1. */
99 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
100 src_c4 -= src_a0 * src_b;
101 src_c5 -= src_a1 * src_b;
102 src_c6 -= src_a2 * src_b;
103 src_c7 -= src_a3 * src_b;
105 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
106 src_c8 -= src_a0 * src_b;
107 src_c9 -= src_a1 * src_b;
108 src_c10 -= src_a2 * src_b;
109 src_c11 -= src_a3 * src_b;
111 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
112 src_c12 -= src_a0 * src_b;
113 src_c13 -= src_a1 * src_b;
114 src_c14 -= src_a2 * src_b;
115 src_c15 -= src_a3 * src_b;
/* Second unrolled iteration of the same rank-1 update. */
117 LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
118 LD_DP2_INC(b, 2, src_b0, src_b1);
120 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
121 src_c0 -= src_a0 * src_b;
122 src_c1 -= src_a1 * src_b;
123 src_c2 -= src_a2 * src_b;
124 src_c3 -= src_a3 * src_b;
126 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
127 src_c4 -= src_a0 * src_b;
128 src_c5 -= src_a1 * src_b;
129 src_c6 -= src_a2 * src_b;
130 src_c7 -= src_a3 * src_b;
132 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
133 src_c8 -= src_a0 * src_b;
134 src_c9 -= src_a1 * src_b;
135 src_c10 -= src_a2 * src_b;
136 src_c11 -= src_a3 * src_b;
138 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
139 src_c12 -= src_a0 * src_b;
140 src_c13 -= src_a1 * src_b;
141 src_c14 -= src_a2 * src_b;
142 src_c15 -= src_a3 * src_b;
/* Tail iteration when bk is odd — presumably guarded by (bk & 1) in the
 * elided code; TODO confirm. */
149 LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
150 LD_DP2_INC(b, 2, src_b0, src_b1);
152 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
153 src_c0 -= src_a0 * src_b;
154 src_c1 -= src_a1 * src_b;
155 src_c2 -= src_a2 * src_b;
156 src_c3 -= src_a3 * src_b;
158 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
159 src_c4 -= src_a0 * src_b;
160 src_c5 -= src_a1 * src_b;
161 src_c6 -= src_a2 * src_b;
162 src_c7 -= src_a3 * src_b;
164 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
165 src_c8 -= src_a0 * src_b;
166 src_c9 -= src_a1 * src_b;
167 src_c10 -= src_a2 * src_b;
168 src_c11 -= src_a3 * src_b;
170 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
171 src_c12 -= src_a0 * src_b;
172 src_c13 -= src_a1 * src_b;
173 src_c14 -= src_a2 * src_b;
174 src_c15 -= src_a3 * src_b;
/* Transpose the tile (pairwise interleave) so each res_c holds one row
 * of C across two columns — rows are what forward substitution needs. */
178 ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
179 ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
180 ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
181 ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
182 ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
183 ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
184 ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
185 ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
/* Row 0 of the 8x8 triangular factor: splat each scalar a[0..7] into
 * both lanes of a vector register. */
187 src_a0 = LD_DP(a + 0);
188 src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
189 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
190 src_a2 = LD_DP(a + 2);
191 src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
192 src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
193 src_a4 = LD_DP(a + 4);
194 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
195 src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
196 src_a6 = LD_DP(a + 6);
197 src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
198 src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
/* Eliminate row 0 from rows 1..7 (columns 0-1 and 2-3 in parallel).
 * The diagonal scaling of res_c0/res_c8 is in elided lines. */
201 res_c1 -= res_c0 * src_a1;
202 res_c2 -= res_c0 * src_a2;
203 res_c3 -= res_c0 * src_a3;
204 res_c4 -= res_c0 * src_a4;
205 res_c5 -= res_c0 * src_a5;
206 res_c6 -= res_c0 * src_a6;
207 res_c7 -= res_c0 * src_a7;
210 res_c9 -= res_c8 * src_a1;
211 res_c10 -= res_c8 * src_a2;
212 res_c11 -= res_c8 * src_a3;
213 res_c12 -= res_c8 * src_a4;
214 res_c13 -= res_c8 * src_a5;
215 res_c14 -= res_c8 * src_a6;
216 res_c15 -= res_c8 * src_a7;
/* Row 1: a[9] is the diagonal entry (loaded as a scalar and splatted). */
218 src_a9 = __msa_cast_to_vector_double(*(a + 9));
219 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
220 src_a10 = LD_DP(a + 10);
221 src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
222 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
223 src_a12 = LD_DP(a + 12);
224 src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
225 src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
226 src_a14 = LD_DP(a + 14);
227 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
228 src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
231 res_c2 -= res_c1 * src_a10;
232 res_c3 -= res_c1 * src_a11;
233 res_c4 -= res_c1 * src_a12;
234 res_c5 -= res_c1 * src_a13;
235 res_c6 -= res_c1 * src_a14;
236 res_c7 -= res_c1 * src_a15;
239 res_c10 -= res_c9 * src_a10;
240 res_c11 -= res_c9 * src_a11;
241 res_c12 -= res_c9 * src_a12;
242 res_c13 -= res_c9 * src_a13;
243 res_c14 -= res_c9 * src_a14;
244 res_c15 -= res_c9 * src_a15;
/* Write solved rows 0-1 back into the packed B buffer... */
246 ST_DP(res_c0, b + 0);
247 ST_DP(res_c8, b + 2);
248 ST_DP(res_c1, b + 4);
249 ST_DP(res_c9, b + 6);
/* ...and transpose them back to column order for the C stores. */
251 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
252 ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
255 ST_DP(src_c4, c_nxt1line);
256 ST_DP(src_c8, c_nxt2line);
257 ST_DP(src_c12, c_nxt3line);
/* Row 2 multipliers (a[18] diagonal, a[19..23] off-diagonal). */
259 src_a18 = LD_DP(a + 18);
260 src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
261 src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
262 src_a20 = LD_DP(a + 20);
263 src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
264 src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
265 src_a22 = LD_DP(a + 22);
266 src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
267 src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
270 res_c3 -= res_c2 * src_a19;
271 res_c4 -= res_c2 * src_a20;
272 res_c5 -= res_c2 * src_a21;
273 res_c6 -= res_c2 * src_a22;
274 res_c7 -= res_c2 * src_a23;
277 res_c11 -= res_c10 * src_a19;
278 res_c12 -= res_c10 * src_a20;
279 res_c13 -= res_c10 * src_a21;
280 res_c14 -= res_c10 * src_a22;
281 res_c15 -= res_c10 * src_a23;
/* Row 3 multipliers (a[27] diagonal). */
283 src_a27 = __msa_cast_to_vector_double(*(a + 27));
284 src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
285 src_a28 = LD_DP(a + 28);
286 src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
287 src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
288 src_a30 = LD_DP(a + 30);
289 src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
290 src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
293 res_c4 -= res_c3 * src_a28;
294 res_c5 -= res_c3 * src_a29;
295 res_c6 -= res_c3 * src_a30;
296 res_c7 -= res_c3 * src_a31;
299 res_c12 -= res_c11 * src_a28;
300 res_c13 -= res_c11 * src_a29;
301 res_c14 -= res_c11 * src_a30;
302 res_c15 -= res_c11 * src_a31;
304 ST_DP(res_c2, b + 8);
305 ST_DP(res_c10, b + 10);
306 ST_DP(res_c3, b + 12);
307 ST_DP(res_c11, b + 14);
309 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
310 ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
/* Row 4 multipliers (a[36] diagonal). */
312 src_a36 = LD_DP(a + 36);
313 src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
314 src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
315 src_a38 = LD_DP(a + 38);
316 src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
317 src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
320 res_c5 -= res_c4 * src_a37;
321 res_c6 -= res_c4 * src_a38;
322 res_c7 -= res_c4 * src_a39;
325 res_c13 -= res_c12 * src_a37;
326 res_c14 -= res_c12 * src_a38;
327 res_c15 -= res_c12 * src_a39;
/* Row 5 multipliers (a[45] diagonal). */
329 src_a45 = __msa_cast_to_vector_double(*(a + 45));
330 src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
331 src_a46 = LD_DP(a + 46);
332 src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
333 src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
336 res_c6 -= res_c5 * src_a46;
337 res_c7 -= res_c5 * src_a47;
340 res_c14 -= res_c13 * src_a46;
341 res_c15 -= res_c13 * src_a47;
343 ST_DP(src_c1, c + 2);
344 ST_DP(src_c5, c_nxt1line + 2);
345 ST_DP(src_c9, c_nxt2line + 2);
346 ST_DP(src_c13, c_nxt3line + 2);
348 ST_DP(res_c4, b + 16);
349 ST_DP(res_c12, b + 18);
350 ST_DP(res_c5, b + 20);
351 ST_DP(res_c13, b + 22);
353 ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
354 ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
/* Rows 6 and 7: a[54] and a[63] are the remaining diagonal entries. */
356 src_a63 = __msa_cast_to_vector_double(*(a + 63));
357 src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
358 src_a54 = LD_DP(a + 54);
359 src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
360 src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
363 res_c7 -= res_c6 * src_a55;
366 res_c15 -= res_c14 * src_a55;
371 ST_DP(src_c2, c + 4);
372 ST_DP(src_c6, c_nxt1line + 4);
373 ST_DP(src_c10, c_nxt2line + 4);
374 ST_DP(src_c14, c_nxt3line + 4);
376 ST_DP(res_c6, b + 24);
377 ST_DP(res_c14, b + 26);
378 ST_DP(res_c7, b + 28);
379 ST_DP(res_c15, b + 30);
381 ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
382 ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
384 ST_DP(src_c3, c + 6);
385 ST_DP(src_c7, c_nxt1line + 6);
386 ST_DP(src_c11, c_nxt2line + 6);
387 ST_DP(src_c15, c_nxt3line + 6);
/*
 * dsolve_8x2_lt_msa: same scheme as dsolve_8x4_lt_msa but for an 8x2
 * C tile (two columns: c and c + ldc).  First the GEMM update
 * C -= A(8x1)*B(1x2) over bk steps, then forward substitution against
 * the packed 8x8 triangular block of A, storing solved rows to the
 * packed B buffer and the transposed result back to C.
 *
 * NOTE(review): this view is elided (braces, the odd-bk tail guard, the
 * diagonal multiply steps and some loads are missing); comments reflect
 * only the visible lines.
 */
390 static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
392 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
393 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
394 v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
395 v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
396 v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
397 v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
398 v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
/* Load the 8x2 C tile, one column per LD_DP4. */
400 LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
401 LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
406 v2f64 src_b, src_b0, src_b1;
408 LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
/* GEMM update loop; A panel for the next step is preloaded into
 * src_a4..src_a7 (software pipelining). */
414 for (i = (bk - 1); i--;)
416 LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
419 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
420 src_c0 -= src_a0 * src_b;
421 src_c1 -= src_a1 * src_b;
422 src_c2 -= src_a2 * src_b;
423 src_c3 -= src_a3 * src_b;
425 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
426 src_c4 -= src_a0 * src_b;
427 src_c5 -= src_a1 * src_b;
428 src_c6 -= src_a2 * src_b;
429 src_c7 -= src_a3 * src_b;
/* Final (epilogue) update with the last loaded A panel. */
441 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
442 src_c0 -= src_a0 * src_b;
443 src_c1 -= src_a1 * src_b;
444 src_c2 -= src_a2 * src_b;
445 src_c3 -= src_a3 * src_b;
447 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
448 src_c4 -= src_a0 * src_b;
449 src_c5 -= src_a1 * src_b;
450 src_c6 -= src_a2 * src_b;
451 src_c7 -= src_a3 * src_b;
/* Transpose to row layout for the substitution phase. */
454 ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
455 ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
456 ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
457 ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
/* Row 0 multipliers a[0..7], splatted to both vector lanes. */
459 src_a0 = LD_DP(a + 0);
460 src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
461 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
462 src_a2 = LD_DP(a + 2);
463 src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
464 src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
465 src_a4 = LD_DP(a + 4);
466 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
467 src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
468 src_a6 = LD_DP(a + 6);
469 src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
470 src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
473 res_c1 -= res_c0 * src_a1;
474 res_c2 -= res_c0 * src_a2;
475 res_c3 -= res_c0 * src_a3;
476 res_c4 -= res_c0 * src_a4;
477 res_c5 -= res_c0 * src_a5;
478 res_c6 -= res_c0 * src_a6;
479 res_c7 -= res_c0 * src_a7;
/* Row 1 (diagonal a[9]). */
481 src_a9 = __msa_cast_to_vector_double(*(a + 9));
482 src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
483 src_a10 = LD_DP(a + 10);
484 src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
485 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
486 src_a12 = LD_DP(a + 12);
487 src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
488 src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
489 src_a14 = LD_DP(a + 14);
490 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
491 src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
494 res_c2 -= res_c1 * src_a10;
495 res_c3 -= res_c1 * src_a11;
496 res_c4 -= res_c1 * src_a12;
497 res_c5 -= res_c1 * src_a13;
498 res_c6 -= res_c1 * src_a14;
499 res_c7 -= res_c1 * src_a15;
/* Row 2 (diagonal a[18]). */
501 src_a18 = LD_DP(a + 18);
502 src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
503 src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
504 src_a20 = LD_DP(a + 20);
505 src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
506 src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
507 src_a22 = LD_DP(a + 22);
508 src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
509 src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
512 res_c3 -= res_c2 * src_a19;
513 res_c4 -= res_c2 * src_a20;
514 res_c5 -= res_c2 * src_a21;
515 res_c6 -= res_c2 * src_a22;
516 res_c7 -= res_c2 * src_a23;
/* Row 3 (diagonal a[27]). */
518 src_a27 = __msa_cast_to_vector_double(*(a + 27));
519 src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
520 src_a28 = LD_DP(a + 28);
521 src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
522 src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
523 src_a30 = LD_DP(a + 30);
524 src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
525 src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
528 res_c4 -= res_c3 * src_a28;
529 res_c5 -= res_c3 * src_a29;
530 res_c6 -= res_c3 * src_a30;
531 res_c7 -= res_c3 * src_a31;
/* Store solved rows 0-3 to the packed B buffer and back to C. */
533 ST_DP(res_c0, b + 0);
534 ST_DP(res_c1, b + 2);
535 ST_DP(res_c2, b + 4);
536 ST_DP(res_c3, b + 6);
538 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
539 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
541 ST_DP2(src_c0, src_c1, c, 2);
542 ST_DP2(src_c4, src_c5, c + ldc, 2);
/* Row 4 (diagonal a[36]). */
544 src_a36 = LD_DP(a + 36);
545 src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
546 src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
547 src_a38 = LD_DP(a + 38);
548 src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
549 src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
552 res_c5 -= res_c4 * src_a37;
553 res_c6 -= res_c4 * src_a38;
554 res_c7 -= res_c4 * src_a39;
/* Row 5 (diagonal a[45]). */
556 src_a45 = __msa_cast_to_vector_double(*(a + 45));
557 src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
558 src_a46 = LD_DP(a + 46);
559 src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
560 src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
563 res_c6 -= res_c5 * src_a46;
564 res_c7 -= res_c5 * src_a47;
/* Rows 6-7 (diagonals a[54], a[63]). */
566 src_a63 = __msa_cast_to_vector_double(*(a + 63));
567 src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
568 src_a54 = LD_DP(a + 54);
569 src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
570 src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
573 res_c7 -= res_c6 * src_a55;
577 ST_DP(res_c4, b + 8);
578 ST_DP(res_c5, b + 10);
579 ST_DP(res_c6, b + 12);
580 ST_DP(res_c7, b + 14);
582 ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
583 ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
585 ST_DP2(src_c2, src_c3, c + 4, 2);
586 ST_DP2(src_c6, src_c7, c + 4 + ldc, 2);
/*
 * dsolve_8x1_lt_msa: scalar 8x1 variant — a single C column, so plain
 * FLOAT locals replace the vector registers (a0..a63 mirror the packed
 * triangular entries used by the vector kernels; c0..c7 the column).
 * NOTE(review): the body is not visible in this view (lines elided);
 * only the declarations are shown.
 */
589 static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
591 FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
592 FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
593 FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
/*
 * dsolve_4x4_lt_msa: 4x4 tile variant.  GEMM update C -= A(4x1)*B(1x4)
 * over bk steps, then forward substitution against the packed 4x4
 * triangular block of A (diagonals at a[0], a[5], a[10], a[15]).
 * NOTE(review): this view is elided (braces, the inner declarations'
 * scope, and the diagonal multiply steps are missing).
 */
724 static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
726 v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
727 v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
728 v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
729 v2f64 src_a10, src_a11, src_a15;
/* Load the 4x4 C tile, one column (2 vectors) per LD_DP2. */
731 LD_DP2(c, 2, src_c0, src_c1);
732 LD_DP2(c + ldc, 2, src_c2, src_c3);
733 LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
734 LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
/* Inner-scope shadows of src_a0/src_a1 for the GEMM loop. */
739 v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
743 LD_DP2(a, 2, src_a0, src_a1);
744 LD_DP2(b, 2, src_b0, src_b1);
/* Rank-1 update: broadcast each of the four b values in turn. */
746 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
747 src_c0 -= src_a0 * src_b;
748 src_c1 -= src_a1 * src_b;
750 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
751 src_c2 -= src_a0 * src_b;
752 src_c3 -= src_a1 * src_b;
754 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
755 src_c4 -= src_a0 * src_b;
756 src_c5 -= src_a1 * src_b;
758 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
759 src_c6 -= src_a0 * src_b;
760 src_c7 -= src_a1 * src_b;
/* Transpose to row layout for forward substitution. */
767 ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
768 ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
769 ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
770 ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
/* Row 0 multipliers a[0..3]. */
772 src_a0 = LD_DP(a + 0);
773 src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
774 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
775 src_a2 = LD_DP(a + 2);
776 src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
777 src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
780 res_c1 -= res_c0 * src_a1;
781 res_c2 -= res_c0 * src_a2;
782 res_c3 -= res_c0 * src_a3;
785 res_c5 -= res_c4 * src_a1;
786 res_c6 -= res_c4 * src_a2;
787 res_c7 -= res_c4 * src_a3;
/* Row 1 (diagonal a[5]). */
789 src_a5 = __msa_cast_to_vector_double(*(a + 5));
790 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
791 src_a6 = LD_DP(a + 6);
792 src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
793 src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
796 res_c2 -= res_c1 * src_a6;
797 res_c3 -= res_c1 * src_a7;
800 res_c6 -= res_c5 * src_a6;
801 res_c7 -= res_c5 * src_a7;
/* Rows 2-3 (diagonals a[10], a[15]). */
803 src_a10 = LD_DP(a + 10);
804 src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
805 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
806 src_a15 = __msa_cast_to_vector_double(*(a + 15));
807 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
810 res_c3 -= res_c2 * src_a11;
814 res_c7 -= res_c6 * src_a11;
/* Store solved rows to packed B, then transpose back to C columns. */
817 ST_DP(res_c0, b + 0);
818 ST_DP(res_c4, b + 2);
819 ST_DP(res_c1, b + 4);
820 ST_DP(res_c5, b + 6);
821 ST_DP(res_c2, b + 8);
822 ST_DP(res_c6, b + 10);
823 ST_DP(res_c3, b + 12);
824 ST_DP(res_c7, b + 14);
826 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
827 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
828 ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
829 ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
831 ST_DP2(src_c0, src_c1, c, 2);
832 ST_DP2(src_c2, src_c3, c + ldc, 2);
833 ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
834 ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
/*
 * dsolve_4x2_lt_msa: 4x2 tile variant (columns c and c + ldc).  GEMM
 * update then forward substitution against the packed 4x4 triangular
 * block of A (diagonals at a[0], a[5], a[10], a[15]).
 * NOTE(review): view is elided (braces, b load, diagonal multiplies).
 */
837 static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
839 v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
840 v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
841 v2f64 src_a10, src_a11, src_a15;
843 LD_DP2(c, 2, src_c0, src_c1);
844 LD_DP2(c + ldc, 2, src_c2, src_c3);
/* Inner-scope shadows used by the GEMM update loop. */
849 v2f64 src_a0, src_a1, src_b, src_b0;
853 LD_DP2(a, 2, src_a0, src_a1);
856 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
857 src_c0 -= src_a0 * src_b;
858 src_c1 -= src_a1 * src_b;
860 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
861 src_c2 -= src_a0 * src_b;
862 src_c3 -= src_a1 * src_b;
/* Transpose to row layout. */
869 ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
870 ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
/* Row 0 multipliers a[0..3]. */
872 src_a0 = LD_DP(a + 0);
873 src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
874 src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
875 src_a2 = LD_DP(a + 2);
876 src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
877 src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
880 res_c1 -= res_c0 * src_a1;
881 res_c2 -= res_c0 * src_a2;
882 res_c3 -= res_c0 * src_a3;
/* Row 1 (diagonal a[5]). */
884 src_a5 = __msa_cast_to_vector_double(*(a + 5));
885 src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
886 src_a6 = LD_DP(a + 6);
887 src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
888 src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
891 res_c2 -= res_c1 * src_a6;
892 res_c3 -= res_c1 * src_a7;
/* Rows 2-3 (diagonals a[10], a[15]). */
894 src_a10 = LD_DP(a + 10);
895 src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
896 src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
897 src_a15 = __msa_cast_to_vector_double(*(a + 15));
898 src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
901 res_c3 -= res_c2 * src_a11;
/* Store to packed B and back to the two C columns. */
904 ST_DP(res_c0, b + 0);
905 ST_DP(res_c1, b + 2);
906 ST_DP(res_c2, b + 4);
907 ST_DP(res_c3, b + 6);
909 ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
910 ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
912 ST_DP2(src_c0, src_c1, c, 2);
913 ST_DP2(src_c2, src_c3, c + ldc, 2);
/*
 * dsolve_4x1_lt_msa: scalar 4x1 variant for a single C column.
 * NOTE(review): body not visible in this view; only declarations shown.
 */
916 static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
918 FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;
/*
 * dsolve_2x4_lt_msa: scalar 2x4 variant.  Two rows (c0, c1) across four
 * C columns (suffixes _nxt1.._nxt3 at strides ldc..3*ldc).  GEMM update
 * then a 2x2 lower-triangular solve (a0 diagonal, a1 off-diagonal,
 * a3 second diagonal — per the declarations).
 * NOTE(review): the GEMM loop header, the c0* updates against b[0], the
 * diagonal scaling and the B writeback are in elided lines.
 */
977 static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
979 FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
980 FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
/* Load the 2x4 tile. */
984 c0_nxt1 = *(c + ldc);
985 c1_nxt1 = *(c + 1 + ldc);
986 c0_nxt2 = *(c + 2 * ldc);
987 c1_nxt2 = *(c + 1 + 2 * ldc);
988 c0_nxt3 = *(c + 3 * ldc);
989 c1_nxt3 = *(c + 1 + 3 * ldc);
/* GEMM update: subtract a(2x1) * b(1x4) for each k step. */
999 c0_nxt1 -= a[0] * b[1];
1000 c1_nxt1 -= a[1] * b[1];
1001 c0_nxt2 -= a[0] * b[2];
1002 c1_nxt2 -= a[1] * b[2];
1003 c0_nxt3 -= a[0] * b[3];
1004 c1_nxt3 -= a[1] * b[3];
/* Forward substitution: eliminate row 0 from row 1 in each column. */
1020 c1_nxt1 -= c0_nxt1 * a1;
1024 c1_nxt2 -= c0_nxt2 * a1;
1028 c1_nxt3 -= c0_nxt3 * a1;
/* Write the solved tile back to C. */
1042 *(c + 0 + ldc) = c0_nxt1;
1043 *(c + 1 + ldc) = c1_nxt1;
1044 *(c + 0 + 2 * ldc) = c0_nxt2;
1045 *(c + 1 + 2 * ldc) = c1_nxt2;
1046 *(c + 0 + 3 * ldc) = c0_nxt3;
1047 *(c + 1 + 3 * ldc) = c1_nxt3;
/*
 * dsolve_2x2_lt_msa: scalar 2x2 variant — two rows, two C columns
 * (c0/c1 and c0_nxt/c1_nxt at stride ldc).
 * NOTE(review): the GEMM loop, first-column updates, diagonal scaling
 * and B/C writeback for column 0 are in elided lines.
 */
1050 static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1052 FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;
1057 c0_nxt = *(c + ldc);
1058 c1_nxt = *(c + 1 + ldc);
/* GEMM update for the second column. */
1069 c0_nxt -= a[0] * b[1];
1070 c1_nxt -= a[1] * b[1];
/* Eliminate row 0 from row 1. */
1086 c1_nxt -= c0_nxt * a1;
1097 *(c + 0 + ldc) = c0_nxt;
1098 *(c + 1 + ldc) = c1_nxt;
/*
 * dsolve_2x1_lt_msa: scalar 2x1 variant for a single C column.
 * NOTE(review): body not visible in this view; only declarations shown.
 */
1101 static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
1103 FLOAT a0, a1, a3, c0, c1;
/*
 * dsolve_1x4_lt_msa: scalar 1x4 variant — one C row across four columns.
 * NOTE(review): the GEMM loop, the diagonal division/scaling and the B
 * writeback are in elided lines; only loads and stores are visible.
 */
1137 static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
1139 FLOAT c0, c1, c2, c3;
1142 c1 = *(c + 1 * ldc);
1143 c2 = *(c + 2 * ldc);
1144 c3 = *(c + 3 * ldc);
1167 *(c + 0 * ldc) = c0;
1168 *(c + 1 * ldc) = c1;
1169 *(c + 2 * ldc) = c2;
1170 *(c + 3 * ldc) = c3;
/* dsolve_1x2_lt_msa: scalar 1x2 variant.  NOTE(review): body elided. */
1178 static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
/* dgmm_dsolve_1x1_msa: scalar 1x1 tail case (no ldc needed for a single
 * element).  NOTE(review): body elided in this view. */
1209 static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
/*
 * CNAME: DTRSM LT driver.  Tiles the m x n problem: the outer loop
 * walks n in strips of 4 (then presumably 2 and 1 in elided branches),
 * the inner loops walk m in strips of 8, 4, 2, 1, dispatching to the
 * matching dsolve_* kernel.  aa/cc/kk (panel pointers and the running
 * solve depth derived from offset) are maintained in elided lines —
 * TODO confirm against the full file.  dummy1 is the unused alpha slot
 * of the standard OpenBLAS trsm kernel signature.
 */
1228 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
1229 FLOAT *c, BLASLONG ldc, BLASLONG offset)
/* n-strips of 4 columns. */
1234 for (j = (n >> 2); j--;)
1240 for (i = (m >> 3); i--;)
1242 dsolve_8x4_lt_msa(aa, b, cc, ldc, kk);
1253 dsolve_4x4_lt_msa(aa, b, cc, ldc, kk);
1262 dsolve_2x4_lt_msa(aa, b, cc, ldc, kk);
1271 dsolve_1x4_lt_msa(aa, b, cc, ldc, kk);
/* n-strip of 2 columns (guard elided). */
1291 for (i = (m >> 3); i--;)
1293 dsolve_8x2_lt_msa(aa, b, cc, ldc, kk);
1304 dsolve_4x2_lt_msa(aa, b, cc, ldc, kk);
1313 dsolve_2x2_lt_msa(aa, b, cc, ldc, kk);
1322 dsolve_1x2_lt_msa(aa, b, cc, ldc, kk);
/* n-strip of 1 column (guard elided); 1-column kernels take no ldc. */
1340 for (i = (m >> 3); i--;)
1342 dsolve_8x1_lt_msa(aa, b, cc, kk);
1353 dsolve_4x1_lt_msa(aa, b, cc, kk);
1362 dsolve_2x1_lt_msa(aa, b, cc, kk);
1371 dgmm_dsolve_1x1_msa(aa, b, cc, kk);