/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
29 #include "macros_msa.h"
/* 8-row x 8-column step of dgemv-N: load 8 consecutive doubles (four
 * v2f64 vectors, element stride 2) from each of the eight column
 * pointers pa0..pa7 at row offset k into t0..t31.
 * NOTE(review): only the load stage is visible in this extract; the
 * multiply-accumulate half of this macro is elided here -- restore it
 * from the upstream OpenBLAS kernel before use. */
31 #define DGEMV_N_8x8() \
33 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
34 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
35 LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
36 LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
37 LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
38 LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
39 LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
40 LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
/* 4-row x 8-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the eight column pointers pa0..pa7 at offset k.
 * NOTE(review): the accumulate half of this macro is elided in this
 * extract -- restore from the upstream OpenBLAS kernel. */
83 #define DGEMV_N_4x8() \
85 LD_DP2(pa0 + k, 2, t0, t1); \
86 LD_DP2(pa1 + k, 2, t4, t5); \
87 LD_DP2(pa2 + k, 2, t8, t9); \
88 LD_DP2(pa3 + k, 2, t12, t13); \
89 LD_DP2(pa4 + k, 2, t16, t17); \
90 LD_DP2(pa5 + k, 2, t20, t21); \
91 LD_DP2(pa6 + k, 2, t24, t25); \
92 LD_DP2(pa7 + k, 2, t28, t29); \
/* 8-row x 4-column step: load 8 consecutive doubles (four v2f64
 * vectors) from each of the four column pointers pa0..pa3 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
119 #define DGEMV_N_8x4() \
121 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
122 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
123 LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
124 LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
/* 4-row x 4-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the four column pointers pa0..pa3 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
147 #define DGEMV_N_4x4() \
149 LD_DP2(pa0 + k, 2, t0, t1); \
150 LD_DP2(pa1 + k, 2, t4, t5); \
151 LD_DP2(pa2 + k, 2, t8, t9); \
152 LD_DP2(pa3 + k, 2, t12, t13); \
/* 8-row x 2-column step: load 8 consecutive doubles (four v2f64
 * vectors) from each of the two column pointers pa0..pa1 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
167 #define DGEMV_N_8x2() \
169 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
170 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
/* 4-row x 2-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the two column pointers pa0..pa1 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
183 #define DGEMV_N_4x2() \
185 LD_DP2(pa0 + k, 2, t0, t1); \
186 LD_DP2(pa1 + k, 2, t4, t5); \
/* Strided-x (general-purpose register) path: scale eight x elements by
 * alpha into temp0..temp7, then splat each scaled scalar across both
 * lanes of a v2f64 (tp0..tp7) for use as per-column multipliers. */
195 #define DLOAD_X8_SCALE_GP() \
196 temp0 = alpha * x[0 * inc_x]; \
197 temp1 = alpha * x[1 * inc_x]; \
198 temp2 = alpha * x[2 * inc_x]; \
199 temp3 = alpha * x[3 * inc_x]; \
200 temp4 = alpha * x[4 * inc_x]; \
201 temp5 = alpha * x[5 * inc_x]; \
202 temp6 = alpha * x[6 * inc_x]; \
203 temp7 = alpha * x[7 * inc_x]; \
205 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
206 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
207 tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
208 tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
209 tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
210 tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
211 tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
212 tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \
/* Strided-x path, 4-column variant: scale four x elements by alpha and
 * splat each into both lanes of a v2f64 (tp0..tp3). */
214 #define DLOAD_X4_SCALE_GP() \
215 temp0 = alpha * x[0 * inc_x]; \
216 temp1 = alpha * x[1 * inc_x]; \
217 temp2 = alpha * x[2 * inc_x]; \
218 temp3 = alpha * x[3 * inc_x]; \
220 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
221 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
222 tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
223 tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
/* Unit-stride x path: load eight contiguous doubles of x as four v2f64
 * vectors, then splat each lane into its own broadcast vector tp0..tp7.
 * NOTE(review): the alpha-scaling statements (x0..x3 *= v_alpha) are
 * elided in this extract -- confirm against the full macro body. */
225 #define DLOAD_X8_SCALE_VECTOR() \
226 LD_DP4(x, 2, x0, x1, x2, x3); \
233 SPLATI_D2_DP(x0, tp0, tp1); \
234 SPLATI_D2_DP(x1, tp2, tp3); \
235 SPLATI_D2_DP(x2, tp4, tp5); \
236 SPLATI_D2_DP(x3, tp6, tp7); \
/* Unit-stride x path, 4-column variant: load four contiguous doubles of
 * x and splat each lane into tp0..tp3.
 * NOTE(review): the alpha-scaling statements are elided in this
 * extract -- confirm against the full macro body. */
238 #define DLOAD_X4_SCALE_VECTOR() \
239 LD_DP2(x, 2, x0, x1); \
244 SPLATI_D2_DP(x0, tp0, tp1); \
245 SPLATI_D2_DP(x1, tp2, tp3); \
/* Gather eight strided y elements into y0..y3 (two doubles per vector)
 * by inserting each element's raw 64-bit pattern with __msa_insert_d.
 * tp0 only seeds the destination register; both lanes are overwritten. */
247 #define DLOAD_Y8_GP() \
248 y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
249 y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
250 y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
251 y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
252 y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
253 y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \
254 y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
255 y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \
/* Gather four strided y elements into y0..y1, two doubles per vector,
 * via raw 64-bit lane inserts (see DLOAD_Y8_GP). */
257 #define DLOAD_Y4_GP() \
258 y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
259 y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
260 y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
261 y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
/* Unit-stride y path: load 8 contiguous doubles of y into y0..y3. */
263 #define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3);
/* Unit-stride y path: load 4 contiguous doubles of y into y0..y1. */
264 #define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1);
/* Scatter y0..y3 back to strided y: copy each 64-bit lane out with
 * __msa_copy_s_d and store it as the raw bit pattern of the double. */
266 #define DSTORE_Y8_GP() \
267 *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
268 *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
269 *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
270 *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
271 *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
272 *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
273 *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
274 *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \
/* Scatter y0..y1 back to strided y, one 64-bit lane per element
 * (see DSTORE_Y8_GP). */
276 #define DSTORE_Y4_GP() \
277 *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
278 *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
279 *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
280 *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
/* Unit-stride y path: store y0..y3 back as 8 contiguous doubles. */
282 #define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2);
/* Unit-stride y path: store y0..y1 back as 4 contiguous doubles. */
283 #define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2);
/* Driver for dgemv no-transpose: walks columns of A in groups of 8
 * (j = n >> 3), then narrower groups, and within each group walks rows
 * in vectorized blocks of 8 (i = m >> 3) with a scalar remainder loop
 * (i = m & 3) that accumulates temp += temp_c * pa_c[k] per column;
 * the final single-column tail does y[0] += alpha * x[0] * pa0[k].
 * NOTE(review): large parts of this macro body (braces, vector
 * compute/store statements, and the macro terminator) are elided in
 * this extract -- restore from the upstream OpenBLAS
 * kernel/mips/dgemv_n_msa.c before compiling. */
285 #define DGEMV_N_MSA() \
286 for (j = (n >> 3); j--;) \
293 for (i = (m >> 3); i--;) \
315 temp0 = alpha * x[0 * inc_x]; \
316 temp1 = alpha * x[1 * inc_x]; \
317 temp2 = alpha * x[2 * inc_x]; \
318 temp3 = alpha * x[3 * inc_x]; \
319 temp4 = alpha * x[4 * inc_x]; \
320 temp5 = alpha * x[5 * inc_x]; \
321 temp6 = alpha * x[6 * inc_x]; \
322 temp7 = alpha * x[7 * inc_x]; \
324 for (i = (m & 3); i--;) \
327 temp += temp0 * pa0[k]; \
328 temp += temp1 * pa1[k]; \
329 temp += temp2 * pa2[k]; \
330 temp += temp3 * pa3[k]; \
331 temp += temp4 * pa4[k]; \
332 temp += temp5 * pa5[k]; \
333 temp += temp6 * pa6[k]; \
334 temp += temp7 * pa7[k]; \
360 for (i = (m >> 3); i--;) \
382 temp0 = alpha * x[0 * inc_x]; \
383 temp1 = alpha * x[1 * inc_x]; \
384 temp2 = alpha * x[2 * inc_x]; \
385 temp3 = alpha * x[3 * inc_x]; \
387 for (i = (m & 3); i--;) \
390 temp += temp0 * pa0[k]; \
391 temp += temp1 * pa1[k]; \
392 temp += temp2 * pa2[k]; \
393 temp += temp3 * pa3[k]; \
411 temp0 = alpha * x[0 * inc_x]; \
412 temp1 = alpha * x[1 * inc_x]; \
414 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
415 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
420 for (i = (m >> 3); i--;) \
442 temp0 = alpha * x[0 * inc_x]; \
443 temp1 = alpha * x[1 * inc_x]; \
445 for (i = (m & 3); i--;) \
448 temp += temp0 * pa0[k]; \
449 temp += temp1 * pa1[k]; \
465 temp = alpha * x[0]; \
472 y[0] += temp * pa0[k]; \
478 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
479 BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
484 FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
485 FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
487 v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
488 v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
489 v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
490 v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
492 v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
503 if ((1 == inc_x) && (1 == inc_y))
505 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
506 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
507 #define DLOAD_Y8 DLOAD_Y8_VECTOR
508 #define DLOAD_Y4 DLOAD_Y4_VECTOR
509 #define DSTORE_Y8 DSTORE_Y8_VECTOR
510 #define DSTORE_Y4 DSTORE_Y4_VECTOR
514 #undef DLOAD_X8_SCALE
515 #undef DLOAD_X4_SCALE
523 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
524 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
525 #define DLOAD_Y8 DLOAD_Y8_VECTOR
526 #define DLOAD_Y4 DLOAD_Y4_VECTOR
527 #define DSTORE_Y8 DSTORE_Y8_VECTOR
528 #define DSTORE_Y4 DSTORE_Y4_VECTOR
532 #undef DLOAD_X8_SCALE
533 #undef DLOAD_X4_SCALE
541 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
542 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
543 #define DLOAD_Y8 DLOAD_Y8_GP
544 #define DLOAD_Y4 DLOAD_Y4_GP
545 #define DSTORE_Y8 DSTORE_Y8_GP
546 #define DSTORE_Y4 DSTORE_Y4_GP
550 #undef DLOAD_X8_SCALE
551 #undef DLOAD_X4_SCALE
559 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
560 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
561 #define DLOAD_Y8 DLOAD_Y8_GP
562 #define DLOAD_Y4 DLOAD_Y4_GP
563 #define DSTORE_Y8 DSTORE_Y8_GP
564 #define DSTORE_Y4 DSTORE_Y4_GP
568 #undef DLOAD_X8_SCALE
569 #undef DLOAD_X4_SCALE