1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/* DGEMV_T_8x8 (load stage): fetch an 8-row x 8-column tile for the
   transposed DGEMV kernel -- four v2f64 vectors (8 doubles) from each of
   the eight column pointers pa0..pa7, starting at element offset k.
   NOTE(review): the stride argument "2" presumably advances one v2f64
   (two doubles) per load -- confirm against LD_DP4 in macros_msa.h.
   The multiply/accumulate portion of this macro is not visible in this
   chunk of the file. */
#define DGEMV_T_8x8() \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
/* DGEMV_T_8x4 (load stage): as DGEMV_T_8x8 but for a 4-row step --
   two v2f64 vectors (4 doubles) from each of the eight column pointers
   pa0..pa7 at offset k.  NOTE(review): the accumulate lines of this
   macro are not visible in this chunk. */
#define DGEMV_T_8x4() \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
    LD_DP2(pa2 + k, 2, t8, t9); \
    LD_DP2(pa3 + k, 2, t12, t13); \
    LD_DP2(pa4 + k, 2, t16, t17); \
    LD_DP2(pa5 + k, 2, t20, t21); \
    LD_DP2(pa6 + k, 2, t24, t25); \
    LD_DP2(pa7 + k, 2, t28, t29); \
/* DGEMV_T_8x2 (load stage): 2-row step -- one v2f64 vector (2 doubles)
   from each of the eight column pointers pa0..pa7 at offset k.
   NOTE(review): the accumulate lines of this macro are not visible in
   this chunk. */
#define DGEMV_T_8x2() \
    t0 = LD_DP(pa0 + k); \
    t4 = LD_DP(pa1 + k); \
    t8 = LD_DP(pa2 + k); \
    t12 = LD_DP(pa3 + k); \
    t16 = LD_DP(pa4 + k); \
    t20 = LD_DP(pa5 + k); \
    t24 = LD_DP(pa6 + k); \
    t28 = LD_DP(pa7 + k); \
/* DGEMV_T_4x8 (load stage): 8-row step for the 4-column pass -- four
   v2f64 vectors from each of pa0..pa3 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_4x8() \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
/* DGEMV_T_4x4 (load stage): 4-row step for the 4-column pass -- two
   v2f64 vectors from each of pa0..pa3 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_4x4() \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
    LD_DP2(pa2 + k, 2, t8, t9); \
    LD_DP2(pa3 + k, 2, t12, t13); \
/* DGEMV_T_4x2 (load stage): 2-row step for the 4-column pass -- one
   v2f64 vector from each of pa0..pa3 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_4x2() \
    t0 = LD_DP(pa0 + k); \
    t4 = LD_DP(pa1 + k); \
    t8 = LD_DP(pa2 + k); \
    t12 = LD_DP(pa3 + k); \
/* DGEMV_T_2x8 (load stage): 8-row step for the 2-column pass -- four
   v2f64 vectors from each of pa0..pa1 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_2x8() \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
/* DGEMV_T_2x4 (load stage): 4-row step for the 2-column pass -- two
   v2f64 vectors from each of pa0..pa1 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_2x4() \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
/* DGEMV_T_2x2 (load stage): 2-row step for the 2-column pass -- one
   v2f64 vector from each of pa0..pa1 at offset k.  NOTE(review): the
   accumulate lines of this macro are not visible in this chunk. */
#define DGEMV_T_2x2() \
    t0 = LD_DP(pa0 + k); \
    t4 = LD_DP(pa1 + k); \
/* DLOAD_X8_GP: gather eight strided doubles of x into the four v2f64
   vectors x0..x3 using element inserts (general inc_x case).  The first
   insert of each pair uses tp0 only as a carrier register; its other
   lane is immediately overwritten by the second insert.
   NOTE(review): reading a double through a (long long *) cast is a
   type-punning trick to move raw bits into the vector lane; it violates
   strict aliasing as written -- confirm the build disables or tolerates
   strict-aliasing optimizations. */
#define DLOAD_X8_GP() \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \
    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \
    x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \
    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \
    x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); \
/* DLOAD_X4_GP: gather four strided doubles of x into x0..x1 via element
   inserts (general inc_x case); see DLOAD_X8_GP for the tp0-carrier and
   aliasing notes. */
#define DLOAD_X4_GP() \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \
/* DLOAD_X2_GP: gather two strided doubles of x into x0 via element
   inserts (general inc_x case); see DLOAD_X8_GP for the tp0-carrier and
   aliasing notes. */
#define DLOAD_X2_GP() \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
/* Unit-stride loaders: when x is contiguous, pull 8/4/2 doubles with
   plain vector loads instead of the element-wise gathers above. */
#define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3);
#define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1);
#define DLOAD_X2_VECTOR() x0 = LD_DP(x);
/*
 * DGEMV_T_MSA: body of the transposed DGEMV kernel.  Columns of A are
 * processed in blocks of 8, then 4, then 2, then 1; each block
 * accumulates dot products of the pa* column pointers with x (vector
 * main loops plus scalar tails), then scales by alpha and adds the
 * results into the strided output vector y.
 * NOTE(review): most of this macro's lines are elided in this chunk of
 * the file; the comments below annotate only the visible statements.
 */
#define DGEMV_T_MSA() \
    /* ---- pass over blocks of 8 columns ---- */ \
    for (j = (n >> 3); j--;) \
    /* vectorized loop over rows, 8 at a time (body partly elided) */ \
    for (i = (m >> 3); i--;) \
    /* horizontal reduction of the eight vector accumulators tp0..tp7 */ \
    /* via interleave/add helpers from macros_msa.h (semantics of */ \
    /* ILVRL_D2_DP/ADD2 not visible here -- see macros_msa.h) */ \
    ILVRL_D2_DP(tp1, tp0, t0, t4); \
    ILVRL_D2_DP(tp3, tp2, t1, t5); \
    ILVRL_D2_DP(tp5, tp4, t2, t6); \
    ILVRL_D2_DP(tp7, tp6, t3, t7); \
    ADD2(t0, t4, t1, t5, t0, t1); \
    ADD2(t2, t6, t3, t7, t2, t3); \
    /* scalar row tail: one element of each of the 8 columns times x */ \
    temp0 += pa0[k] * x[0]; \
    temp1 += pa1[k] * x[0]; \
    temp2 += pa2[k] * x[0]; \
    temp3 += pa3[k] * x[0]; \
    temp4 += pa4[k] * x[0]; \
    temp5 += pa5[k] * x[0]; \
    temp6 += pa6[k] * x[0]; \
    temp7 += pa7[k] * x[0]; \
    /* read-modify-write of 8 strided y elements: y += alpha * temp */ \
    res0 = y[0 * inc_y]; \
    res1 = y[1 * inc_y]; \
    res2 = y[2 * inc_y]; \
    res3 = y[3 * inc_y]; \
    res4 = y[4 * inc_y]; \
    res5 = y[5 * inc_y]; \
    res6 = y[6 * inc_y]; \
    res7 = y[7 * inc_y]; \
    res0 += alpha * temp0; \
    res1 += alpha * temp1; \
    res2 += alpha * temp2; \
    res3 += alpha * temp3; \
    res4 += alpha * temp4; \
    res5 += alpha * temp5; \
    res6 += alpha * temp6; \
    res7 += alpha * temp7; \
    y[0 * inc_y] = res0; \
    y[1 * inc_y] = res1; \
    y[2 * inc_y] = res2; \
    y[3 * inc_y] = res3; \
    y[4 * inc_y] = res4; \
    y[5 * inc_y] = res5; \
    y[6 * inc_y] = res6; \
    y[7 * inc_y] = res7; \
    /* ---- 4-column block (guard/loop header elided) ---- */ \
    for (i = (m >> 3); i--;) \
    /* reduce the four accumulators tp0..tp3 into t0..t1 */ \
    ILVRL_D2_DP(tp1, tp0, t0, t4); \
    ILVRL_D2_DP(tp3, tp2, t1, t5); \
    ADD2(t0, t4, t1, t5, t0, t1); \
    /* scalar row tail for 4 columns */ \
    temp0 += pa0[k] * x[0]; \
    temp1 += pa1[k] * x[0]; \
    temp2 += pa2[k] * x[0]; \
    temp3 += pa3[k] * x[0]; \
    /* y += alpha * temp for 4 strided y elements */ \
    res0 = y[0 * inc_y]; \
    res1 = y[1 * inc_y]; \
    res2 = y[2 * inc_y]; \
    res3 = y[3 * inc_y]; \
    res0 += alpha * temp0; \
    res1 += alpha * temp1; \
    res2 += alpha * temp2; \
    res3 += alpha * temp3; \
    y[0 * inc_y] = res0; \
    y[1 * inc_y] = res1; \
    y[2 * inc_y] = res2; \
    y[3 * inc_y] = res3; \
    /* ---- 2-column block (guard/loop header elided) ---- */ \
    for (i = (m >> 3); i--;) \
    ILVRL_D2_DP(tp1, tp0, t0, t4); \
    temp0 += pa0[k] * x[0]; \
    temp1 += pa1[k] * x[0]; \
    res0 = y[0 * inc_y]; \
    res1 = y[1 * inc_y]; \
    res0 += alpha * temp0; \
    res1 += alpha * temp1; \
    y[0 * inc_y] = res0; \
    y[1 * inc_y] = res1; \
    /* ---- final single-column tail ---- */ \
    temp0 += pa0[k] * x[0]; \
    y[0] += alpha * temp0; \
/*
 * CNAME: transposed double-precision GEMV kernel entry point.  Based on
 * the macros above, it accumulates dot products of columns of A with x
 * and adds the alpha-scaled results into the strided vector y.
 * NOTE(review): the parameter list and the function body are only
 * partially visible in this chunk; the trailing parameters and most of
 * the body are elided, so the comments below cover only what is shown.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
    /* pa0..pa7: pointers to up to eight columns of A processed per pass */
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    /* scalar dot-product accumulators and y read-modify-write temporaries */
    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
    /* MSA vector temporaries: x lanes, tile loads t0..t31, accumulators tp0..tp7 */
    v2f64 x0, x1, x2, x3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    /* NOTE(review): a conditional on inc_x (likely #if / #else or an
       if/else with local #defines) appears to be elided around the two
       DLOAD_X* bindings below.  This branch binds the loaders to the
       contiguous vector loads... */
#define DLOAD_X8 DLOAD_X8_VECTOR
#define DLOAD_X4 DLOAD_X4_VECTOR
#define DLOAD_X2 DLOAD_X2_VECTOR
    /* ...and this branch binds them to the general-stride gathers. */
#define DLOAD_X8 DLOAD_X8_GP
#define DLOAD_X4 DLOAD_X4_GP
#define DLOAD_X2 DLOAD_X2_GP