1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 /* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01): 0 1 2 3 => 1 0 3 2 */
35 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
36 FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
41 FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3;
42 v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
43 v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
44 v4f32 da_i_vec, da_i_vec_neg, da_r_vec;
50 if ((0.0 == da_r) && (0.0 == da_i))
52 v4f32 zero_v = __msa_cast_to_vector_float(0);
53 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
54 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
55 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
56 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
58 for (i = (n >> 5); i--;)
60 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
61 zero_v, zero_v, x, 4);
62 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
63 zero_v, zero_v, x, 4);
70 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
71 zero_v, zero_v, x, 4);
76 ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
81 ST_SP2_INC(zero_v, zero_v, x, 4);
86 ST_SP(zero_v, x); x += 4;
98 da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
99 da_i_vec_neg = -da_i_vec;
100 da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
105 BLASLONG pref_offset;
107 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
110 pref_offset = L1_DATA_LINESIZE - pref_offset;
111 pref_offset = pref_offset / sizeof(FLOAT);
113 x_pref = x + pref_offset + 64 + 32;
115 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
116 for (i = (n >> 5)- 1; i--;)
118 PREF_OFFSET(x_pref, 0);
119 PREF_OFFSET(x_pref, 32);
120 PREF_OFFSET(x_pref, 64);
121 PREF_OFFSET(x_pref, 96);
122 PREF_OFFSET(x_pref, 128);
123 PREF_OFFSET(x_pref, 160);
124 PREF_OFFSET(x_pref, 192);
125 PREF_OFFSET(x_pref, 224);
128 x8 = LD_SP(px); px += 4;
130 x9 = LD_SP(px); px += 4;
132 x10 = LD_SP(px); px += 4;
134 x11 = LD_SP(px); px += 4;
136 x12 = LD_SP(px); px += 4;
138 x13 = LD_SP(px); px += 4;
140 x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177);
141 x14 = LD_SP(px); px += 4;
143 x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177);
144 x15 = LD_SP(px); px += 4;
146 x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177);
148 x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177);
149 ST_SP(x0, x); x += 4;
151 x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177);
152 ST_SP(x1, x); x += 4;
154 x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177);
155 ST_SP(x2, x); x += 4;
157 x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177);
158 ST_SP(x3, x); x += 4;
160 x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177);
161 ST_SP(x4, x); x += 4;
163 x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177);
164 ST_SP(x5, x); x += 4;
166 x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177);
167 ST_SP(x6, x); x += 4;
169 x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177);
170 ST_SP(x7, x); x += 4;
171 x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177);
172 ST_SP(x8, x); x += 4;
173 x0 = LD_SP(px); px += 4;
174 x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177);
175 ST_SP(x9, x); x += 4;
176 x1 = LD_SP(px); px += 4;
177 x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177);
178 ST_SP(x10, x); x += 4;
179 x2 = LD_SP(px); px += 4;
180 x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177);
181 ST_SP(x11, x); x += 4;
182 x3 = LD_SP(px); px += 4;
183 x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177);
184 ST_SP(x12, x); x += 4;
185 x4 = LD_SP(px); px += 4;
186 ST_SP(x13, x); x += 4;
187 x5 = LD_SP(px); px += 4;
188 ST_SP(x14, x); x += 4;
189 x6 = LD_SP(px); px += 4;
190 ST_SP(x15, x); x += 4;
191 x7 = LD_SP(px); px += 4;
194 LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
195 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
197 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
199 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
201 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
203 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
204 SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
205 SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177);
206 SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177);
207 ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
208 x12, x13, x14, x15, x, 4);
215 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
216 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
218 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
220 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
221 SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
222 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
227 LD_SP4_INC(px, 4, x0, x1, x2, x3);
228 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
230 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
231 ST_SP4_INC(x0, x1, x2, x3, x, 4);
236 LD_SP2_INC(px, 4, x0, x1);
237 MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
238 SHF_W2_SP(x0, x1, x0, x1, SHF_177);
239 ST_SP2_INC(x0, x1, x, 4);
244 LD_GP4_INC(px, 1, f0, f1, f2, f3);
245 MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i,
247 ST_GP4_INC(f1, f0, f3, f2, x, 1);
252 LD_GP2_INC(px, 1, f0, f1);
253 MUL2(f0, da_i, f1, -da_i, f0, f1);
254 ST_GP2_INC(f1, f0, x, 1);
258 else if (0.0 == da_i)
260 da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
265 BLASLONG pref_offset;
267 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
270 pref_offset = L1_DATA_LINESIZE - pref_offset;
271 pref_offset = pref_offset / sizeof(FLOAT);
273 x_pref = x + pref_offset + 64 + 32;
275 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
276 for (i = (n >> 5)- 1; i--;)
278 PREF_OFFSET(x_pref, 0);
279 PREF_OFFSET(x_pref, 32);
280 PREF_OFFSET(x_pref, 64);
281 PREF_OFFSET(x_pref, 96);
282 PREF_OFFSET(x_pref, 128);
283 PREF_OFFSET(x_pref, 160);
284 PREF_OFFSET(x_pref, 192);
285 PREF_OFFSET(x_pref, 224);
288 x8 = LD_SP(px); px += 4;
290 x9 = LD_SP(px); px += 4;
292 x10 = LD_SP(px); px += 4;
294 x11 = LD_SP(px); px += 4;
296 x12 = LD_SP(px); px += 4;
298 x13 = LD_SP(px); px += 4;
300 ST_SP(x0, x); x += 4;
301 x14 = LD_SP(px); px += 4;
303 ST_SP(x1, x); x += 4;
304 x15 = LD_SP(px); px += 4;
306 ST_SP(x2, x); x += 4;
308 ST_SP(x3, x); x += 4;
310 ST_SP(x4, x); x += 4;
312 ST_SP(x5, x); x += 4;
314 ST_SP(x6, x); x += 4;
316 ST_SP(x7, x); x += 4;
318 ST_SP(x8, x); x += 4;
319 x0 = LD_SP(px); px += 4;
321 ST_SP(x9, x); x += 4;
322 x1 = LD_SP(px); px += 4;
324 ST_SP(x10, x); x += 4;
325 x2 = LD_SP(px); px += 4;
326 ST_SP(x11, x); x += 4;
327 x3 = LD_SP(px); px += 4;
328 ST_SP(x12, x); x += 4;
329 x4 = LD_SP(px); px += 4;
330 ST_SP(x13, x); x += 4;
331 x5 = LD_SP(px); px += 4;
332 ST_SP(x14, x); x += 4;
333 x6 = LD_SP(px); px += 4;
334 ST_SP(x15, x); x += 4;
335 x7 = LD_SP(px); px += 4;
338 LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
339 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
341 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
343 MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
345 MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
347 ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
348 x12, x13, x14, x15, x, 4);
355 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
356 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
358 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
360 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
365 LD_SP4_INC(px, 4, x0, x1, x2, x3);
366 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
368 ST_SP4_INC(x0, x1, x2, x3, x, 4);
373 LD_SP2_INC(px, 4, x0, x1);
374 MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
375 ST_SP2_INC(x0, x1, x, 4);
380 LD_GP4_INC(px, 1, f0, f1, f2, f3);
381 MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
382 ST_GP4_INC(f0, f1, f2, f3, x, 1);
387 LD_GP2_INC(px, 1, f0, f1);
388 MUL2(f0, da_r, f1, da_r, f0, f1);
389 ST_GP2_INC(f0, f1, x, 1);
396 BLASLONG pref_offset;
398 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
401 pref_offset = L1_DATA_LINESIZE - pref_offset;
402 pref_offset = pref_offset / sizeof(FLOAT);
404 x_pref = x + pref_offset + 64;
406 da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
407 da_i_vec_neg = -da_i_vec;
408 da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
410 da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
412 for (i = (n >> 5); i--;)
414 PREF_OFFSET(x_pref, 0);
415 PREF_OFFSET(x_pref, 32);
416 PREF_OFFSET(x_pref, 64);
417 PREF_OFFSET(x_pref, 96);
418 PREF_OFFSET(x_pref, 128);
419 PREF_OFFSET(x_pref, 160);
420 PREF_OFFSET(x_pref, 192);
421 PREF_OFFSET(x_pref, 224);
424 LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
425 x11, x12, x13, x14, x15);
426 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
428 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
430 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
432 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
434 SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
435 SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
436 SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177);
437 SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177);
438 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
439 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
440 FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
441 FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
442 ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
443 d12, d13, d14, d15, x, 4);
450 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
451 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
453 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
455 SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
456 SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
457 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
458 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
459 ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4);
464 LD_SP4_INC(px, 4, x0, x1, x2, x3);
465 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
467 SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
468 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
469 ST_SP4_INC(d0, d1, d2, d3, x, 4);
474 LD_SP2_INC(px, 4, x0, x1);
475 MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
476 SHF_W2_SP(d0, d1, d0, d1, SHF_177);
477 FMADD2(x0, x1, da_r_vec, d0, d1);
478 ST_SP2_INC(d0, d1, x, 4);
483 LD_GP4_INC(px, 1, f0, f1, f2, f3);
494 ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1);
499 LD_GP2_INC(px, 1, f0, f1);
506 ST_GP2_INC(tp0, tp1, x, 1);
515 if ((0.0 == da_r) && (0.0 == da_i))
525 else if (0.0 == da_r)
527 da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
528 da_i_vec_neg = -da_i_vec;
529 da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
531 for (i = (n >> 4); i--;)
533 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
534 x10, x11, x12, x13, x14, x15);
535 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
536 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
537 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
539 MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
596 LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
597 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
598 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
629 LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
630 PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
631 MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1);
656 MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3);
671 MUL2(f0, da_i, f1, -da_i, f0, f1);
678 else if (0.0 == da_i)
680 da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
682 for (i = (n >> 4); i--;)
684 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
685 x10, x11, x12, x13, x14, x15);
686 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
687 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
688 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
690 MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
747 LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
748 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
749 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
780 LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
781 PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
782 MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
807 MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
822 MUL2(f0, da_r, f1, da_r, f0, f1);
831 da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
832 da_i_vec_neg = -da_i_vec;
833 da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
835 da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
837 for (i = (n >> 4); i--;)
839 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
840 x10, x11, x12, x13, x14, x15);
841 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
842 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
843 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
845 MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
847 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
849 MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
851 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
852 SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
853 ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
854 ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7);
910 LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
911 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
912 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
914 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
916 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
917 ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
947 LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
948 PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
949 MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1);
950 MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
951 SHF_W2_SP(x0, x1, x0, x1, SHF_177);
952 ADD2(d0, x0, d1, x1, d0, d1);