1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 /* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10), i.e. word order 0 1 2 3 => 2 3 0 1 */
35 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
36 FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
41 FLOAT tp0, tp1, f0, f1;
42 v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
43 v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
44 v2f64 da_i_vec, da_i_vec_neg, da_r_vec;
50 if ((0.0 == da_r) && (0.0 == da_i))
52 v2f64 zero_v = __msa_cast_to_vector_double(0);
53 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
54 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
56 for (i = (n >> 4); i--;)
58 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
59 zero_v, zero_v, x, 2);
60 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
61 zero_v, zero_v, x, 2);
68 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
69 zero_v, zero_v, x, 2);
74 ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
79 ST_DP2_INC(zero_v, zero_v, x, 2);
90 da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
91 da_i_vec_neg = -da_i_vec;
92 da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
99 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
102 pref_offset = L1_DATA_LINESIZE - pref_offset;
103 pref_offset = pref_offset / sizeof(FLOAT);
105 x_pref = x + pref_offset + 32 + 16;
107 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
108 for (i = (n >> 4)- 1; i--;)
110 PREF_OFFSET(x_pref, 0);
111 PREF_OFFSET(x_pref, 32);
112 PREF_OFFSET(x_pref, 64);
113 PREF_OFFSET(x_pref, 96);
114 PREF_OFFSET(x_pref, 128);
115 PREF_OFFSET(x_pref, 160);
116 PREF_OFFSET(x_pref, 192);
117 PREF_OFFSET(x_pref, 224);
120 x8 = LD_DP(px); px += 2;
122 x9 = LD_DP(px); px += 2;
124 x10 = LD_DP(px); px += 2;
126 x11 = LD_DP(px); px += 2;
128 x12 = LD_DP(px); px += 2;
130 x13 = LD_DP(px); px += 2;
132 x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
133 x14 = LD_DP(px); px += 2;
135 x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
136 x15 = LD_DP(px); px += 2;
138 x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
140 x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
141 ST_DP(x0, x); x += 2;
143 x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
144 ST_DP(x1, x); x += 2;
146 x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
147 ST_DP(x2, x); x += 2;
149 x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
150 ST_DP(x3, x); x += 2;
152 x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
153 ST_DP(x4, x); x += 2;
155 x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
156 ST_DP(x5, x); x += 2;
158 x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
159 ST_DP(x6, x); x += 2;
161 x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
162 ST_DP(x7, x); x += 2;
163 x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
164 ST_DP(x8, x); x += 2;
165 x0 = LD_DP(px); px += 2;
166 x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
167 ST_DP(x9, x); x += 2;
168 x1 = LD_DP(px); px += 2;
169 x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
170 ST_DP(x10, x); x += 2;
171 x2 = LD_DP(px); px += 2;
172 x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
173 ST_DP(x11, x); x += 2;
174 x3 = LD_DP(px); px += 2;
175 x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
176 ST_DP(x12, x); x += 2;
177 x4 = LD_DP(px); px += 2;
178 ST_DP(x13, x); x += 2;
179 x5 = LD_DP(px); px += 2;
180 ST_DP(x14, x); x += 2;
181 x6 = LD_DP(px); px += 2;
182 ST_DP(x15, x); x += 2;
183 x7 = LD_DP(px); px += 2;
186 LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
187 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
189 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
191 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
193 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
195 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
196 SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
197 SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
198 SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
199 ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
200 x12, x13, x14, x15, x, 2);
207 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
208 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
210 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
212 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
213 SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
214 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
219 LD_DP4_INC(px, 2, x0, x1, x2, x3);
220 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
222 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
223 ST_DP4_INC(x0, x1, x2, x3, x, 2);
228 LD_DP2_INC(px, 2, x0, x1);
229 MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
230 SHF_W2_DP(x0, x1, x0, x1, SHF_78);
231 ST_DP2_INC(x0, x1, x, 2);
236 LD_GP2_INC(px, 1, f0, f1);
237 MUL2(f0, da_i, f1, -da_i, f0, f1);
238 ST_GP2_INC(f1, f0, x, 1);
242 else if (0.0 == da_i)
244 da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
249 BLASLONG pref_offset;
251 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
254 pref_offset = L1_DATA_LINESIZE - pref_offset;
255 pref_offset = pref_offset / sizeof(FLOAT);
257 x_pref = x + pref_offset + 32 + 16;
259 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
260 for (i = (n >> 4)- 1; i--;)
262 PREF_OFFSET(x_pref, 0);
263 PREF_OFFSET(x_pref, 32);
264 PREF_OFFSET(x_pref, 64);
265 PREF_OFFSET(x_pref, 96);
266 PREF_OFFSET(x_pref, 128);
267 PREF_OFFSET(x_pref, 160);
268 PREF_OFFSET(x_pref, 192);
269 PREF_OFFSET(x_pref, 224);
272 x8 = LD_DP(px); px += 2;
274 x9 = LD_DP(px); px += 2;
276 x10 = LD_DP(px); px += 2;
278 x11 = LD_DP(px); px += 2;
280 x12 = LD_DP(px); px += 2;
282 x13 = LD_DP(px); px += 2;
284 ST_DP(x0, x); x += 2;
285 x14 = LD_DP(px); px += 2;
287 ST_DP(x1, x); x += 2;
288 x15 = LD_DP(px); px += 2;
290 ST_DP(x2, x); x += 2;
292 ST_DP(x3, x); x += 2;
294 ST_DP(x4, x); x += 2;
296 ST_DP(x5, x); x += 2;
298 ST_DP(x6, x); x += 2;
300 ST_DP(x7, x); x += 2;
302 ST_DP(x8, x); x += 2;
303 x0 = LD_DP(px); px += 2;
305 ST_DP(x9, x); x += 2;
306 x1 = LD_DP(px); px += 2;
308 ST_DP(x10, x); x += 2;
309 x2 = LD_DP(px); px += 2;
310 ST_DP(x11, x); x += 2;
311 x3 = LD_DP(px); px += 2;
312 ST_DP(x12, x); x += 2;
313 x4 = LD_DP(px); px += 2;
314 ST_DP(x13, x); x += 2;
315 x5 = LD_DP(px); px += 2;
316 ST_DP(x14, x); x += 2;
317 x6 = LD_DP(px); px += 2;
318 ST_DP(x15, x); x += 2;
319 x7 = LD_DP(px); px += 2;
322 LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
323 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
325 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
327 MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
329 MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
331 ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
332 x12, x13, x14, x15, x, 2);
339 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
340 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
342 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
344 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
349 LD_DP4_INC(px, 2, x0, x1, x2, x3);
350 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
352 ST_DP4_INC(x0, x1, x2, x3, x, 2);
357 LD_DP2_INC(px, 2, x0, x1);
358 MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
359 ST_DP2_INC(x0, x1, x, 2);
364 LD_GP2_INC(px, 1, f0, f1);
365 MUL2(f0, da_r, f1, da_r, f0, f1);
366 ST_GP2_INC(f0, f1, x, 1);
373 BLASLONG pref_offset;
375 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
378 pref_offset = L1_DATA_LINESIZE - pref_offset;
379 pref_offset = pref_offset / sizeof(FLOAT);
381 x_pref = x + pref_offset + 32;
383 da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
384 da_i_vec_neg = -da_i_vec;
385 da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
387 da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
389 for (i = (n >> 4); i--;)
391 PREF_OFFSET(x_pref, 0);
392 PREF_OFFSET(x_pref, 32);
393 PREF_OFFSET(x_pref, 64);
394 PREF_OFFSET(x_pref, 96);
395 PREF_OFFSET(x_pref, 128);
396 PREF_OFFSET(x_pref, 160);
397 PREF_OFFSET(x_pref, 192);
398 PREF_OFFSET(x_pref, 224);
401 LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
402 x11, x12, x13, x14, x15);
403 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
405 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
407 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
409 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
411 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
412 SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
413 SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
414 SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
415 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
416 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
417 FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
418 FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
419 ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
420 d12, d13, d14, d15, x, 2);
427 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
428 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
430 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
432 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
433 SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
434 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
435 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
436 ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
441 LD_DP4_INC(px, 2, x0, x1, x2, x3);
442 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
444 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
445 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
446 ST_DP4_INC(d0, d1, d2, d3, x, 2);
451 LD_DP2_INC(px, 2, x0, x1);
452 MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
453 SHF_W2_DP(d0, d1, d0, d1, SHF_78);
454 FMADD2(x0, x1, da_r_vec, d0, d1);
455 ST_DP2_INC(d0, d1, x, 2);
460 LD_GP2_INC(px, 1, f0, f1);
467 ST_GP2_INC(tp0, tp1, x, 1);
476 if ((0.0 == da_r) && (0.0 == da_i))
478 v2f64 zero_v = __msa_cast_to_vector_double(0);
479 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
480 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
482 for (i = (n >> 4); i--;)
484 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
485 zero_v, zero_v, x, inc_x2);
486 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
487 zero_v, zero_v, x, inc_x2);
494 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
495 zero_v, zero_v, x, inc_x2);
500 ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
505 ST_DP2_INC(zero_v, zero_v, x, inc_x2);
514 else if (0.0 == da_r)
516 da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
517 da_i_vec_neg = -da_i_vec;
518 da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
520 for (i = (n >> 4); i--;)
522 LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
523 x10, x11, x12, x13, x14, x15);
524 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
526 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
528 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
530 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
532 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
533 SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
534 SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
535 SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
536 ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
537 x12, x13, x14, x15, x, inc_x2);
544 LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
545 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
547 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
549 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
550 SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
551 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
556 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
557 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
559 SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
560 ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
565 LD_DP2_INC(px, inc_x2, x0, x1);
566 MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
567 SHF_W2_DP(x0, x1, x0, x1, SHF_78);
568 ST_DP2_INC(x0, x1, x, inc_x2);
573 LD_GP2_INC(px, 1, f0, f1);
574 MUL2(f0, da_i, f1, -da_i, f0, f1);
575 ST_GP2_INC(f1, f0, x, 1);
579 else if (0.0 == da_i)
581 da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
583 for (i = (n >> 4); i--;)
585 LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
586 x10, x11, x12, x13, x14, x15);
587 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
589 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
591 MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
593 MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
595 ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
596 x12, x13, x14, x15, x, inc_x2);
603 LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
604 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
606 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
608 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
613 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
614 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
616 ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
621 LD_DP2_INC(px, inc_x2, x0, x1);
622 MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
623 ST_DP2_INC(x0, x1, x, inc_x2);
628 LD_GP2_INC(px, 1, f0, f1);
629 MUL2(f0, da_r, f1, da_r, f0, f1);
630 ST_GP2_INC(f0, f1, x, 1);
636 da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
637 da_i_vec_neg = -da_i_vec;
638 da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
640 da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
642 for (i = (n >> 4); i--;)
644 LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
645 x10, x11, x12, x13, x14, x15);
646 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
648 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
650 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
652 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
654 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
655 SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
656 SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
657 SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
658 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
659 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
660 FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
661 FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
662 ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
663 d12, d13, d14, d15, x, inc_x2);
670 LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
671 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
673 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
675 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
676 SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
677 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
678 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
679 ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
684 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
685 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
687 SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
688 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
689 ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
694 LD_DP2_INC(px, inc_x2, x0, x1);
695 MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
696 SHF_W2_DP(d0, d1, d0, d1, SHF_78);
697 FMADD2(x0, x1, da_r_vec, d0, d1);
698 ST_DP2_INC(d0, d1, x, inc_x2);
703 LD_GP2_INC(px, 1, f0, f1);
710 ST_GP2_INC(tp0, tp1, x, 1);