1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * SGEMV_N_8x8: the visible part issues one LD_SP2 per column pointer
 * (pa0..pa7), each at element offset k with a step argument of 4, filling
 * the register pairs t0/t1 through t14/t15.
 * Relies on pa0..pa7, k and t0..t15 existing in the expanding scope.
 * NOTE(review): LD_SP2 is provided by macros_msa.h; presumably it loads two
 * consecutive 4-float vectors -- confirm there.
 * NOTE(review): the last load still ends in a continuation backslash, so the
 * macro body goes on past what is shown here; those statements are not
 * visible in this excerpt.
 */
31 #define SGEMV_N_8x8() \
33 LD_SP2(pa0 + k, 4, t0, t1); \
34 LD_SP2(pa1 + k, 4, t2, t3); \
35 LD_SP2(pa2 + k, 4, t4, t5); \
36 LD_SP2(pa3 + k, 4, t6, t7); \
37 LD_SP2(pa4 + k, 4, t8, t9); \
38 LD_SP2(pa5 + k, 4, t10, t11); \
39 LD_SP2(pa6 + k, 4, t12, t13); \
40 LD_SP2(pa7 + k, 4, t14, t15); \
/*
 * SGEMV_N_4x8: the visible part performs a single LD_SP from each of the
 * eight column pointers pa0..pa7 at element offset k, storing the results in
 * the even-numbered registers t0, t2, ..., t14.
 * Relies on pa0..pa7, k and the t registers existing in the expanding scope.
 * NOTE(review): LD_SP is provided by macros_msa.h; presumably one 4-float
 * vector load -- confirm there.
 * NOTE(review): the body continues after the last load shown (trailing
 * continuation backslash); the remaining statements are not visible here.
 */
67 #define SGEMV_N_4x8() \
69 t0 = LD_SP(pa0 + k); \
70 t2 = LD_SP(pa1 + k); \
71 t4 = LD_SP(pa2 + k); \
72 t6 = LD_SP(pa3 + k); \
73 t8 = LD_SP(pa4 + k); \
74 t10 = LD_SP(pa5 + k); \
75 t12 = LD_SP(pa6 + k); \
76 t14 = LD_SP(pa7 + k); \
/*
 * SGEMV_N_8x4: the visible part issues one LD_SP2 per column pointer
 * (pa0..pa3) at element offset k with a step argument of 4, filling the
 * register pairs t0/t1 through t6/t7.
 * Relies on pa0..pa3, k and t0..t7 existing in the expanding scope.
 * NOTE(review): the body continues after the last load shown (trailing
 * continuation backslash); the remaining statements are not visible here.
 */
88 #define SGEMV_N_8x4() \
90 LD_SP2(pa0 + k, 4, t0, t1); \
91 LD_SP2(pa1 + k, 4, t2, t3); \
92 LD_SP2(pa2 + k, 4, t4, t5); \
93 LD_SP2(pa3 + k, 4, t6, t7); \
/*
 * SGEMV_N_4x4: the visible part performs a single LD_SP from each of the
 * four column pointers pa0..pa3 at element offset k, storing the results in
 * t0, t2, t4 and t6.
 * Relies on pa0..pa3, k and the t registers existing in the expanding scope.
 * NOTE(review): the body continues after the last load shown (trailing
 * continuation backslash); the remaining statements are not visible here.
 */
108 #define SGEMV_N_4x4() \
110 t0 = LD_SP(pa0 + k); \
111 t2 = LD_SP(pa1 + k); \
112 t4 = LD_SP(pa2 + k); \
113 t6 = LD_SP(pa3 + k); \
/*
 * SGEMV_N_8x2: the visible part issues one LD_SP2 for each of the two
 * column pointers pa0 and pa1 at element offset k with a step argument of 4,
 * filling t0/t1 and t2/t3.
 * Relies on pa0, pa1, k and t0..t3 existing in the expanding scope.
 * NOTE(review): the body continues after the last load shown (trailing
 * continuation backslash); the remaining statements are not visible here.
 */
121 #define SGEMV_N_8x2() \
123 LD_SP2(pa0 + k, 4, t0, t1); \
124 LD_SP2(pa1 + k, 4, t2, t3); \
/*
 * SGEMV_N_4x2: the visible part performs a single LD_SP from each of the
 * two column pointers pa0 and pa1 at element offset k, storing the results
 * in t0 and t2.
 * Relies on pa0, pa1, k, t0 and t2 existing in the expanding scope.
 * NOTE(review): the body continues after the last load shown (trailing
 * continuation backslash); the remaining statements are not visible here.
 */
133 #define SGEMV_N_4x2() \
135 t0 = LD_SP(pa0 + k); \
136 t2 = LD_SP(pa1 + k); \
/*
 * SLOAD_X8_SCALE_GP: reads eight elements of x at indices 0..7 * inc_x,
 * multiplies each by alpha into the scalars temp0..temp7, and then passes
 * each scalar through COPY_FLOAT_TO_VECTOR to produce tp0..tp7.
 * Both the scalar temps and the tp vectors are left assigned afterwards.
 * Relies on alpha, x, inc_x, temp0..temp7 and tp0..tp7 existing in the
 * expanding scope; x itself is not advanced.
 * NOTE(review): COPY_FLOAT_TO_VECTOR is provided by macros_msa.h; presumably
 * it replicates the scalar into every lane -- confirm there.
 */
142 #define SLOAD_X8_SCALE_GP() \
143 temp0 = alpha * x[0 * inc_x]; \
144 temp1 = alpha * x[1 * inc_x]; \
145 temp2 = alpha * x[2 * inc_x]; \
146 temp3 = alpha * x[3 * inc_x]; \
147 temp4 = alpha * x[4 * inc_x]; \
148 temp5 = alpha * x[5 * inc_x]; \
149 temp6 = alpha * x[6 * inc_x]; \
150 temp7 = alpha * x[7 * inc_x]; \
152 tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
153 tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
154 tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
155 tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
156 tp4 = COPY_FLOAT_TO_VECTOR(temp4); \
157 tp5 = COPY_FLOAT_TO_VECTOR(temp5); \
158 tp6 = COPY_FLOAT_TO_VECTOR(temp6); \
159 tp7 = COPY_FLOAT_TO_VECTOR(temp7); \
/*
 * SLOAD_X4_SCALE_GP: four-element counterpart of SLOAD_X8_SCALE_GP.
 * Reads x at indices 0..3 * inc_x, multiplies each by alpha into
 * temp0..temp3, and then passes each scalar through COPY_FLOAT_TO_VECTOR to
 * produce tp0..tp3.
 * Relies on alpha, x, inc_x, temp0..temp3 and tp0..tp3 existing in the
 * expanding scope; x itself is not advanced.
 */
161 #define SLOAD_X4_SCALE_GP() \
162 temp0 = alpha * x[0 * inc_x]; \
163 temp1 = alpha * x[1 * inc_x]; \
164 temp2 = alpha * x[2 * inc_x]; \
165 temp3 = alpha * x[3 * inc_x]; \
167 tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
168 tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
169 tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
170 tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
/*
 * SLOAD_X8_SCALE_VECTOR: loads x0 and x1 from x with LD_SP2 (step argument
 * 4), then expands x0 into tp0..tp3 and x1 into tp4..tp7 via SPLATI_W4_SP.
 * Relies on x, x0, x1 and tp0..tp7 existing in the expanding scope.
 * NOTE(review): SPLATI_W4_SP is provided by macros_msa.h; presumably it
 * splats each of the four lanes of its first argument into its own output
 * vector -- confirm there.
 * NOTE(review): despite the SCALE in the name, no multiplication by alpha
 * (or v_alpha) is visible between the load and the splats in this excerpt;
 * verify against the complete file that the scaling step is present.
 */
172 #define SLOAD_X8_SCALE_VECTOR() \
173 LD_SP2(x, 4, x0, x1); \
178 SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
179 SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \
/*
 * SLOAD_X4_SCALE_VECTOR: the visible part expands x0 into tp0..tp3 via
 * SPLATI_W4_SP.
 * Relies on x0 and tp0..tp3 existing in the expanding scope.
 * NOTE(review): neither the load that fills x0 nor any scaling by alpha is
 * visible in this excerpt; verify against the complete file that both
 * happen before the splat.
 */
181 #define SLOAD_X4_SCALE_VECTOR() \
184 SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
/*
 * SLOAD_Y8_GP: gathers eight elements of y at indices 0..7 * inc_y into the
 * two vectors y0 (elements 0..3) and y1 (elements 4..7).
 * Each element is read through an int pointer and placed into lane 0..3 with
 * __msa_insert_w, so the value is moved as a raw bit pattern rather than
 * converted.
 * tp0 only serves as the starting vector operand for the lane-0 insert; all
 * four lanes of y0 and y1 are overwritten by the following inserts.
 * Relies on y, inc_y, tp0, y0 and y1 existing in the expanding scope.
 * NOTE(review): reading the y elements via (int *) is type punning through a
 * pointer cast; it depends on the build tolerating that aliasing.
 */
186 #define SLOAD_Y8_GP() \
187 y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
188 y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
189 y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
190 y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
191 y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \
192 y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \
193 y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, *((int *)(y + 6 * inc_y))); \
194 y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \
/*
 * SLOAD_Y4_GP: gathers four elements of y at indices 0..3 * inc_y into the
 * vector y0, one lane per element, using __msa_insert_w on the element's bit
 * pattern read through an int pointer.
 * tp0 only serves as the starting vector operand for the lane-0 insert; all
 * four lanes of y0 are overwritten by the following inserts.
 * Relies on y, inc_y, tp0 and y0 existing in the expanding scope.
 * NOTE(review): reading the y elements via (int *) is type punning through a
 * pointer cast; it depends on the build tolerating that aliasing.
 */
196 #define SLOAD_Y4_GP() \
197 y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
198 y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
199 y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
200 y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
/*
 * Vector loads of y: the 8-element form fills y0 and y1 via LD_SP2 with a
 * step argument of 4, the 4-element form fills y0 via LD_SP.
 * Both expansions already end in a semicolon, so an invocation followed by
 * ';' yields an extra empty statement.
 * Relies on y, y0 and y1 existing in the expanding scope.
 */
202 #define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1);
203 #define SLOAD_Y4_VECTOR() y0 = LD_SP(y);
/*
 * SSTORE_Y8_GP: scatters the eight lanes held in y0 (lanes 0..3) and y1
 * (lanes 0..3) back to y at indices 0..7 * inc_y.
 * Each lane is extracted with __msa_copy_s_w and written through an int
 * pointer, so the value is moved as a raw bit pattern rather than converted.
 * Relies on y, inc_y, y0 and y1 existing in the expanding scope.
 * NOTE(review): writing the y elements via (int *) is type punning through a
 * pointer cast; it depends on the build tolerating that aliasing.
 */
205 #define SSTORE_Y8_GP() \
206 *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
207 *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
208 *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
209 *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
210 *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \
211 *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \
212 *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \
213 *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \
/*
 * SSTORE_Y4_GP: scatters the four lanes of y0 back to y at indices
 * 0..3 * inc_y, extracting each lane with __msa_copy_s_w and writing it
 * through an int pointer (raw bit pattern, no conversion).
 * Relies on y, inc_y and y0 existing in the expanding scope.
 * NOTE(review): writing the y elements via (int *) is type punning through a
 * pointer cast; it depends on the build tolerating that aliasing.
 */
215 #define SSTORE_Y4_GP() \
216 *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
217 *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
218 *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
219 *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
/*
 * Vector stores of y: the 8-element form writes y0 and y1 to y via ST_SP2
 * with a step argument of 4, the 4-element form writes y0 via ST_SP.
 * Both expansions already end in a semicolon, so an invocation followed by
 * ';' yields an extra empty statement.
 * Relies on y, y0 and y1 existing in the expanding scope.
 */
221 #define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4);
222 #define SSTORE_Y4_VECTOR() ST_SP(y0, y);
/*
 * SGEMV_N_MSA: main kernel body, expanded inside the caller with its locals
 * (m, n, i, j, k, alpha, x, inc_x, y, pa0..pa7, temp, temp0..temp7, tp0, tp1).
 *
 * What the visible statements establish:
 *  - an outer loop that runs (n >> 3) times;
 *  - inner loops that run (m >> 3) times;
 *  - scalar tail loops that run (m & 3) times and accumulate
 *    temp += tempN * paN[k], where tempN = alpha * x[N * inc_x];
 *  - the scalar tail appears three times, using 8, 4 and 2 of the
 *    tempN / paN pairs respectively, followed by a single-column form that
 *    computes temp = alpha * x[0] and updates y[0] += temp * pa0[k].
 *
 * NOTE(review): presumably these correspond to column blocks of width
 * 8, 4, 2 and 1 -- the conditions that select each section are not visible
 * in this excerpt, so confirm against the complete file.
 * NOTE(review): braces, pointer setup/advancement, the vector row blocks and
 * the stores to y are likewise not visible here; this comment deliberately
 * does not describe them.
 */
224 #define SGEMV_N_MSA() \
225 for (j = (n >> 3); j--;) \
232 for (i = (m >> 3); i--;) \
254 temp0 = alpha * x[0 * inc_x]; \
255 temp1 = alpha * x[1 * inc_x]; \
256 temp2 = alpha * x[2 * inc_x]; \
257 temp3 = alpha * x[3 * inc_x]; \
258 temp4 = alpha * x[4 * inc_x]; \
259 temp5 = alpha * x[5 * inc_x]; \
260 temp6 = alpha * x[6 * inc_x]; \
261 temp7 = alpha * x[7 * inc_x]; \
263 for (i = (m & 3); i--;) \
266 temp += temp0 * pa0[k]; \
267 temp += temp1 * pa1[k]; \
268 temp += temp2 * pa2[k]; \
269 temp += temp3 * pa3[k]; \
270 temp += temp4 * pa4[k]; \
271 temp += temp5 * pa5[k]; \
272 temp += temp6 * pa6[k]; \
273 temp += temp7 * pa7[k]; \
299 for (i = (m >> 3); i--;) \
321 temp0 = alpha * x[0 * inc_x]; \
322 temp1 = alpha * x[1 * inc_x]; \
323 temp2 = alpha * x[2 * inc_x]; \
324 temp3 = alpha * x[3 * inc_x]; \
326 for (i = (m & 3); i--;) \
329 temp += temp0 * pa0[k]; \
330 temp += temp1 * pa1[k]; \
331 temp += temp2 * pa2[k]; \
332 temp += temp3 * pa3[k]; \
350 temp0 = alpha * x[0 * inc_x]; \
351 temp1 = alpha * x[1 * inc_x]; \
353 tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
354 tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
359 for (i = (m >> 3); i--;) \
381 temp0 = alpha * x[0 * inc_x]; \
382 temp1 = alpha * x[1 * inc_x]; \
384 for (i = (m & 3); i--;) \
387 temp += temp0 * pa0[k]; \
388 temp += temp1 * pa1[k]; \
404 temp = alpha * x[0]; \
411 y[0] += temp * pa0[k]; \
418 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
419 BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
424 FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
425 FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
426 v4f32 v_alpha, x0, x1, y0, y1;
427 v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
428 v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
430 v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
441 if ((1 == inc_x) && (1 == inc_y))
443 #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
444 #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
445 #define SLOAD_Y8 SLOAD_Y8_VECTOR
446 #define SLOAD_Y4 SLOAD_Y4_VECTOR
447 #define SSTORE_Y8 SSTORE_Y8_VECTOR
448 #define SSTORE_Y4 SSTORE_Y4_VECTOR
452 #undef SLOAD_X8_SCALE
453 #undef SLOAD_X4_SCALE
461 #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
462 #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
463 #define SLOAD_Y8 SLOAD_Y8_VECTOR
464 #define SLOAD_Y4 SLOAD_Y4_VECTOR
465 #define SSTORE_Y8 SSTORE_Y8_VECTOR
466 #define SSTORE_Y4 SSTORE_Y4_VECTOR
470 #undef SLOAD_X8_SCALE
471 #undef SLOAD_X4_SCALE
479 #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
480 #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
481 #define SLOAD_Y8 SLOAD_Y8_GP
482 #define SLOAD_Y4 SLOAD_Y4_GP
483 #define SSTORE_Y8 SSTORE_Y8_GP
484 #define SSTORE_Y4 SSTORE_Y4_GP
488 #undef SLOAD_X8_SCALE
489 #undef SLOAD_X4_SCALE
497 #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
498 #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
499 #define SLOAD_Y8 SLOAD_Y8_GP
500 #define SLOAD_Y4 SLOAD_Y4_GP
501 #define SSTORE_Y8 SSTORE_Y8_GP
502 #define SSTORE_Y4 SSTORE_Y4_GP
506 #undef SLOAD_X8_SCALE
507 #undef SLOAD_X4_SCALE